davs2-1.6/.gitattributes

# common settings that generally should always be used with your language specific settings

# Auto detect text files and perform LF normalization
# http://davidlaing.com/2012/09/19/customise-your-gitattributes-to-become-a-git-ninja/
* text=auto

#
# The above will handle all files NOT found below
#

# Scripts
*.bat text eol=crlf
*.cmd text eol=crlf
*.ps1 text eol=crlf
*.sh text eol=lf

# Documents
*.doc diff=astextplain
*.DOC diff=astextplain
*.docx diff=astextplain
*.DOCX diff=astextplain
*.dot diff=astextplain
*.DOT diff=astextplain
*.ppt diff=astextplain
*.PPT diff=astextplain
*.pptx diff=astextplain
*.PPTX diff=astextplain
*.pdf diff=astextplain
*.PDF diff=astextplain
*.rtf diff=astextplain
*.RTF diff=astextplain
*.md text
*.adoc text
*.textile text
*.mustache text
*.csv text
*.tab text
*.tsv text
*.sql text

# Graphics
*.png binary
*.jpg binary
*.jpeg binary
*.gif binary
*.tif binary
*.tiff binary
*.ico binary
# SVG treated as an asset (binary) by default. If you want to treat it as text,
# comment-out the following line and uncomment the line after.
*.svg binary
#*.svg text
*.eps binary

# sources
*.c text eol=crlf
*.cc text eol=crlf
*.cxx text eol=crlf
*.cpp text eol=crlf
*.c++ text eol=crlf
*.hpp text eol=crlf
*.h text eol=crlf
*.h++ text eol=crlf
*.hh text eol=crlf
*.asm text eol=crlf
*.S text eol=crlf
*.cfg text eol=crlf
*.txt text eol=lf

# QT Project files
*.pro text eol=lf

# Compiled Object files
*.slo binary
*.lo binary
*.o binary
*.obj binary

# Precompiled Headers
*.gch binary
*.pch binary

# Compiled Dynamic libraries
*.so binary
*.dylib binary
*.dll binary

# Compiled Static libraries
*.lai binary
*.la binary
*.a binary
*.lib binary

# Executables
*.exe binary
*.out binary
*.app binary

# Custom for Visual Studio
*.sln text eol=crlf
*.csproj text eol=crlf
*.vbproj text eol=crlf
*.fsproj text eol=crlf
*.dbproj text eol=crlf
*.vcproj text eol=crlf
*.vcxproj text eol=crlf
*.sln text eol=crlf
*.vcxitems text eol=crlf
*.props text eol=crlf
*.filters text eol=crlf
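# To verify which of the attributes above apply to a given file, one can use
# git's check-attr command (a sketch; the path below is just an example):
#
#   git check-attr -a -- common/davs2.cc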
davs2-1.6/.github/ISSUE_TEMPLATE/----.md

---
name: General question
about: Usage questions / security issues / other questions

---

Please send an email to: sswang@pku.edu.cn, or fill in the form under "Advanced Settings" -> "Feedback" in the application.

davs2-1.6/.github/ISSUE_TEMPLATE/bug_report.md

---
name: Bug report
about: Create a bug report / if you believe you have found a problem in the code

---

**Describe the bug**
A clear and concise description of what the bug is.

**To Reproduce**
Steps to reproduce the behavior, including the command-line parameters and the input files.

**Expected behavior**
A clear and concise description of what you expected to happen.

**Screenshots**
If applicable, add screenshots to help explain your problem.

**Desktop (please complete the following information):**
 - OS: [e.g. Windows 10, Ubuntu 18.04]
 - Compiler: [e.g. Visual Studio 2013, GCC 5.6.0]
 - yasm: [e.g. 1.2.0, 1.3.0-luofl]

Please include your operating system (with version), compiler (GCC/G++, VS), and assembler yasm (version number).

**Additional context**
Add any other context about the problem here, e.g. the input video sequences and bitstream files.

davs2-1.6/.github/ISSUE_TEMPLATE/feature_request.md

---
name: Feature request
about: Suggest an idea for this project

---

Please fill in the following four key items in detail:

## Description of the feature

## Benefits of the feature

## Impact of lacking this feature

## Ideas for implementation

davs2-1.6/.gitignore

Debug/
Release/
x64_Debug/
x64_Release/
build/linux/cavs2dec*
My*/
*.user
*.suo
*.ncb
*.aps
*.pdb
*.res
*.dat
*.manifest
*.map
*.dep
*.idb
*.ilk
*.htm
*.exp
*.lib
*.obj
*.dll*
*.exe
*.avs
*.mkv
*.mp4
*.y4m
*.yuv
*.log
*.bak
*.o
*.a
*.so
*.cd
*.sdf
*.opensdf
*.depend
*.pc
*.mak
*.so.*
*.dec
*.txt
config.h
*.iobj
*.ipdb
version.h

davs2-1.6/COPYING

                    GNU GENERAL PUBLIC LICENSE
                       Version 2, June 1991

 Copyright (C) 1989, 1991 Free Software Foundation, Inc.
 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 Everyone is permitted to copy and distribute verbatim copies
 of this license document, but changing it is not allowed.

                            Preamble

The licenses for most software are designed to take away your freedom to share and change it. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change free software--to make sure the software is free for all its users. This General Public License applies to most of the Free Software Foundation's software and to any other program whose authors commit to using it. (Some other Free Software Foundation software is covered by the GNU Library General Public License instead.) You can apply it to your programs, too.

When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for this service if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs; and that you know you can do these things.

To protect your rights, we need to make restrictions that forbid anyone to deny you these rights or to ask you to surrender the rights. These restrictions translate to certain responsibilities for you if you distribute copies of the software, or if you modify it.

For example, if you distribute copies of such a program, whether gratis or for a fee, you must give the recipients all the rights that you have. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights.

We protect your rights with two steps: (1) copyright the software, and (2) offer you this license which gives you legal permission to copy, distribute and/or modify the software.

Also, for each author's protection and ours, we want to make certain that everyone understands that there is no warranty for this free software. If the software is modified by someone else and passed on, we want its recipients to know that what they have is not the original, so that any problems introduced by others will not reflect on the original authors' reputations.

Finally, any free program is threatened constantly by software patents.
We wish to avoid the danger that redistributors of a free program will individually obtain patent licenses, in effect making the program proprietary. To prevent this, we have made it clear that any patent must be licensed for everyone's free use or not licensed at all.

The precise terms and conditions for copying, distribution and modification follow.

                    GNU GENERAL PUBLIC LICENSE
   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION

0. This License applies to any program or other work which contains a notice placed by the copyright holder saying it may be distributed under the terms of this General Public License. The "Program", below, refers to any such program or work, and a "work based on the Program" means either the Program or any derivative work under copyright law: that is to say, a work containing the Program or a portion of it, either verbatim or with modifications and/or translated into another language. (Hereinafter, translation is included without limitation in the term "modification".) Each licensee is addressed as "you".

Activities other than copying, distribution and modification are not covered by this License; they are outside its scope. The act of running the Program is not restricted, and the output from the Program is covered only if its contents constitute a work based on the Program (independent of having been made by running the Program). Whether that is true depends on what the Program does.

1. You may copy and distribute verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice and disclaimer of warranty; keep intact all the notices that refer to this License and to the absence of any warranty; and give any other recipients of the Program a copy of this License along with the Program.

You may charge a fee for the physical act of transferring a copy, and you may at your option offer warranty protection in exchange for a fee.

2. You may modify your copy or copies of the Program or any portion of it, thus forming a work based on the Program, and copy and distribute such modifications or work under the terms of Section 1 above, provided that you also meet all of these conditions:

    a) You must cause the modified files to carry prominent notices stating that you changed the files and the date of any change.

    b) You must cause any work that you distribute or publish, that in whole or in part contains or is derived from the Program or any part thereof, to be licensed as a whole at no charge to all third parties under the terms of this License.

    c) If the modified program normally reads commands interactively when run, you must cause it, when started running for such interactive use in the most ordinary way, to print or display an announcement including an appropriate copyright notice and a notice that there is no warranty (or else, saying that you provide a warranty) and that users may redistribute the program under these conditions, and telling the user how to view a copy of this License. (Exception: if the Program itself is interactive but does not normally print such an announcement, your work based on the Program is not required to print an announcement.)

These requirements apply to the modified work as a whole. If identifiable sections of that work are not derived from the Program, and can be reasonably considered independent and separate works in themselves, then this License, and its terms, do not apply to those sections when you distribute them as separate works. But when you distribute the same sections as part of a whole which is a work based on the Program, the distribution of the whole must be on the terms of this License, whose permissions for other licensees extend to the entire whole, and thus to each and every part regardless of who wrote it.

Thus, it is not the intent of this section to claim rights or contest your rights to work written entirely by you; rather, the intent is to exercise the right to control the distribution of derivative or collective works based on the Program.

In addition, mere aggregation of another work not based on the Program with the Program (or with a work based on the Program) on a volume of a storage or distribution medium does not bring the other work under the scope of this License.

3. You may copy and distribute the Program (or a work based on it, under Section 2) in object code or executable form under the terms of Sections 1 and 2 above provided that you also do one of the following:

    a) Accompany it with the complete corresponding machine-readable source code, which must be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or,

    b) Accompany it with a written offer, valid for at least three years, to give any third party, for a charge no more than your cost of physically performing source distribution, a complete machine-readable copy of the corresponding source code, to be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or,

    c) Accompany it with the information you received as to the offer to distribute corresponding source code. (This alternative is allowed only for noncommercial distribution and only if you received the program in object code or executable form with such an offer, in accord with Subsection b above.)

The source code for a work means the preferred form of the work for making modifications to it. For an executable work, complete source code means all the source code for all modules it contains, plus any associated interface definition files, plus the scripts used to control compilation and installation of the executable. However, as a special exception, the source code distributed need not include anything that is normally distributed (in either source or binary form) with the major components (compiler, kernel, and so on) of the operating system on which the executable runs, unless that component itself accompanies the executable.

If distribution of executable or object code is made by offering access to copy from a designated place, then offering equivalent access to copy the source code from the same place counts as distribution of the source code, even though third parties are not compelled to copy the source along with the object code.

4. You may not copy, modify, sublicense, or distribute the Program except as expressly provided under this License. Any attempt otherwise to copy, modify, sublicense or distribute the Program is void, and will automatically terminate your rights under this License. However, parties who have received copies, or rights, from you under this License will not have their licenses terminated so long as such parties remain in full compliance.

5. You are not required to accept this License, since you have not signed it. However, nothing else grants you permission to modify or distribute the Program or its derivative works. These actions are prohibited by law if you do not accept this License. Therefore, by modifying or distributing the Program (or any work based on the Program), you indicate your acceptance of this License to do so, and all its terms and conditions for copying, distributing or modifying the Program or works based on it.

6. Each time you redistribute the Program (or any work based on the Program), the recipient automatically receives a license from the original licensor to copy, distribute or modify the Program subject to these terms and conditions. You may not impose any further restrictions on the recipients' exercise of the rights granted herein. You are not responsible for enforcing compliance by third parties to this License.

7. If, as a consequence of a court judgment or allegation of patent infringement or for any other reason (not limited to patent issues), conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot distribute so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not distribute the Program at all. For example, if a patent license would not permit royalty-free redistribution of the Program by all those who receive copies directly or indirectly through you, then the only way you could satisfy both it and this License would be to refrain entirely from distribution of the Program.

If any portion of this section is held invalid or unenforceable under any particular circumstance, the balance of the section is intended to apply and the section as a whole is intended to apply in other circumstances.

It is not the purpose of this section to induce you to infringe any patents or other property right claims or to contest validity of any such claims; this section has the sole purpose of protecting the integrity of the free software distribution system, which is implemented by public license practices. Many people have made generous contributions to the wide range of software distributed through that system in reliance on consistent application of that system; it is up to the author/donor to decide if he or she is willing to distribute software through any other system and a licensee cannot impose that choice.

This section is intended to make thoroughly clear what is believed to be a consequence of the rest of this License.

8. If the distribution and/or use of the Program is restricted in certain countries either by patents or by copyrighted interfaces, the original copyright holder who places the Program under this License may add an explicit geographical distribution limitation excluding those countries, so that distribution is permitted only in or among countries not thus excluded. In such case, this License incorporates the limitation as if written in the body of this License.

9. The Free Software Foundation may publish revised and/or new versions of the General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns.

Each version is given a distinguishing version number.
If the Program specifies a version number of this License which applies to it and "any later version", you have the option of following the terms and conditions either of that version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of this License, you may choose any version ever published by the Free Software Foundation.

10. If you wish to incorporate parts of the Program into other free programs whose distribution conditions are different, write to the author to ask for permission. For software which is copyrighted by the Free Software Foundation, write to the Free Software Foundation; we sometimes make exceptions for this. Our decision will be guided by the two goals of preserving the free status of all derivatives of our free software and of promoting the sharing and reuse of software generally.

                            NO WARRANTY

11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.

12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.

                     END OF TERMS AND CONDITIONS

            How to Apply These Terms to Your New Programs

If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms.

To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively convey the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found.

    <one line to give the program's name and a brief idea of what it does.>
    Copyright (C) <year> <name of author>

    This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version.

    This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

Also add information on how to contact you by electronic and paper mail.
If the program is interactive, make it output a short notice like this when it starts in an interactive mode:

    Gnomovision version 69, Copyright (C) year name of author
    Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details.

The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, the commands you use may be called something other than `show w' and `show c'; they could even be mouse-clicks or menu items--whatever suits your program.

You should also get your employer (if you work as a programmer) or your school, if any, to sign a "copyright disclaimer" for the program, if necessary. Here is a sample; alter the names:

    Yoyodyne, Inc., hereby disclaims all copyright interest in the program `Gnomovision' (which makes passes at compilers) written by James Hacker.

    <signature of Ty Coon>, 1 April 1989
    Ty Coon, President of Vice

This General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Library General Public License instead of this License.

This program is also available under a commercial proprietary license. For more information, contact us at sswang @ pku.edu.cn.

davs2-1.6/README.md

# davs2

**davs2** is an open-source decoder of the `AVS2-P2/IEEE1857.4` video coding standard. An encoder, **xavs2**, can be found at [Github][2] or [Gitee (mirror in China)][3].

[![GitHub tag](https://img.shields.io/github/tag/pkuvcl/davs2.svg?style=plastic)]()
[![GitHub issues](https://img.shields.io/github/issues/pkuvcl/davs2.svg)](https://github.com/pkuvcl/davs2/issues)
[![GitHub forks](https://img.shields.io/github/forks/pkuvcl/davs2.svg)](https://github.com/pkuvcl/davs2/network)
[![GitHub stars](https://img.shields.io/github/stars/pkuvcl/davs2.svg)](https://github.com/pkuvcl/davs2/stargazers)

Stargazers over time

[![Stargazers over time](https://starcharts.herokuapp.com/pkuvcl/davs2.svg)](https://starcharts.herokuapp.com/pkuvcl/davs2)

## Compile it

### Windows

Open the `./build/vs2013/davs2.sln` solution with VS2013 or a later version of Visual Studio, and set `davs2` as the startup project.

#### Notes

1. A `shell executor`, e.g. the bash shipped with Git for Windows, is needed and must be discoverable via the `PATH` variable. For example, the path `C:\Program Files\Git\bin` can be added if git-for-windows is installed to its default location.
2. `vsyasm` is needed; version `1.2.0` is suggested on Windows, since later official releases have build problems. It can be downloaded from http://yasm.tortall.net/Download.html . A later version, `1.3.0` (an unofficial revision; please read the build instructions of `yasm` to build it yourself), can be found at https://github.com/luofalei/yasm/tree/vs2013 . Install `vsyasm` as follows (if you are using `VS2013`):
```
(1) Copy `vsyasm.exe` to the following directory:
    "C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\bin\"
(2) Copy the other 3 files of `vsyasm` to the `MSBuild template` directory:
    "C:\Program Files (x86)\MSBuild\Microsoft.Cpp\v4.0\V120\BuildCustomizations\"
(3) Re-open the solution.
```

### Linux

```
$ cd build/linux
$ ./configure
$ make
```
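After `make`, the decoder can also be installed and linked as a library. A minimal sketch of that flow, assuming the default `configure` prefix, that the installed `davs2.pc` is on pkg-config's search path, and that `myplayer.c` is a hypothetical program of yours that includes `davs2.h`:

```
$ sudo make install-lib-shared   # also runs install-lib-dev, which installs davs2.h and davs2.pc
$ gcc myplayer.c -o myplayer $(pkg-config --cflags --libs davs2)
```

The target names come from `build/linux/Makefile`; `davs2.pc` is the pkg-config file it installs.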
## Try it

Decode the AVS2 stream `test.avs` with `1` thread and output to a *YUV file* named `dec.yuv`.
```
./davs2 -i test.avs -t 1 -o dec.yuv
```

Decode the AVS2 stream `test.avs` and display the decoding result via *ffplay*.
```
./davs2 -i test.avs -t 1 -o stdout | ffplay -i -
```

### Parameter Instructions

| Parameter | Alias | Description |
| :--------: | :---------: | :--------------: |
| --input=test.avs | -i test.avs | Set the input bitstream file |
| --output=dec.yuv | -o dec.yuv | Set the output YUV file |
| --psnr=rec.yuv | -r rec.yuv | Set the reference reconstruction YUV file |
| --threads=N | -t N | Set the number of decoding threads (default: 1) |
| --md5=M | -m M | Reference MD5, used to check whether the output YUV is correct |
| --verbose | -v | Report decoding status for every frame (default: enabled) |
| --help | -h | Show this help message |
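These options can be combined in one run. For example, a sketch of a verification run (the reference file name and the MD5 string below are placeholders, not real values):

```
# decode with 4 threads, compare against a reference reconstruction,
# and check the output YUV against a reference MD5
./davs2 -i test.avs -t 4 -o dec.yuv -r rec.yuv -m 9e107d9d372bb6826bd81d3542a419d6
```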
## Issue and Pull Request

[Issues should be reported here][6].

If you have fixed bugs or implemented new features and would like to share them with the public, please [make a Pull Request][7].

## Homepages

[PKU-VCL][1]

`AVS2-P2/IEEE1857.4` Encoder: [xavs2 (Github)][2], [xavs2 (mirror in China)][3]
`AVS2-P2/IEEE1857.4` Decoder: [davs2 (Github)][4], [davs2 (mirror in China)][5]

[1]: http://vcl.idm.pku.edu.cn/ "PKU-VCL"
[2]: https://github.com/pkuvcl/xavs2 "xavs2 github repository"
[3]: https://gitee.com/pkuvcl/xavs2 "xavs2 gitee repository"
[4]: https://github.com/pkuvcl/davs2 "davs2 decoder@github"
[5]: https://gitee.com/pkuvcl/davs2 "davs2 decoder@gitee"
[6]: https://github.com/pkuvcl/davs2/issues "report issues"
[7]: https://github.com/pkuvcl/davs2/pulls "pull request"

davs2-1.6/README.zh.md

# davs2

A decoder compliant with the `AVS2-P2/IEEE1857.4` video coding standard. The corresponding encoder, **xavs2**, can be found at [Github][2] or [Gitee (mirror in China)][3].

[![GitHub tag](https://img.shields.io/github/tag/pkuvcl/davs2.svg?style=plastic)]()
[![GitHub issues](https://img.shields.io/github/issues/pkuvcl/davs2.svg)](https://github.com/pkuvcl/davs2/issues)
[![GitHub forks](https://img.shields.io/github/forks/pkuvcl/davs2.svg)](https://github.com/pkuvcl/davs2/network)
[![GitHub stars](https://img.shields.io/github/stars/pkuvcl/davs2.svg)](https://github.com/pkuvcl/davs2/stargazers)

[![Stargazers over time](https://starcharts.herokuapp.com/pkuvcl/davs2.svg)](https://starcharts.herokuapp.com/pkuvcl/davs2)

## How to build

### Windows

Open the solution `./build/win32/DAVS2.sln` with `VS2013`, or with a newer version of Visual Studio, and build it. After opening the solution, set the `davs2` project as the startup project and build.

#### Notes

1. When building this project for the first time, a `shell executor` is required, e.g. the `bash` from `git-for-windows`; the directory containing that `bash` must be added to the system environment variable `PATH`. If you installed `git-for-windows` with its default configuration, adding `C:\Program Files\Git\bin` to the environment variable is sufficient.
2. `vsyasm` must be installed; the recommended version is `1.2.0`, because newer official releases have build problems. Download: http://yasm.tortall.net/Download.html . A modified `1.3.0` that builds correctly (note: this modification is unofficial; refer to the yasm build guide when building it) can be found here: https://github.com/luofalei/yasm/tree/vs2013 . The typical installation steps are as follows (when using VS2013):
```
(1) Copy vsyasm.exe to the following directory:
    "C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\bin\"
(2) Copy the remaining three vsyasm files to the MSBuild template directory:
    "C:\Program Files (x86)\MSBuild\Microsoft.Cpp\v4.0\V120\BuildCustomizations\"
(3) Re-open VS2013; the asmopt project should now load normally and build without errors.
```

### Linux

On Linux, run the following commands in order to build:
```
$ cd build/linux
$ ./configure
$ make
```

## Run and test

Decode the AVS2 bitstream file `test.avs` with `1` thread and write the result to the YUV file `dec.yuv`:
```
./davs2 -i test.avs -t 1 -o dec.yuv
```

Decode the AVS2 bitstream file `test.avs` and play it back with ffplay:
```
./davs2 -i test.avs -t 1 -o stdout | ffplay -i -
```

### Parameter description

| Parameter | Alias | Meaning |
| :--------: | :---------: | :--------------: |
| --input=test.avs | -i test.avs | Path of the input bitstream file |
| --output=dec.yuv | -o dec.yuv | Path of the output decoded YUV file |
| --psnr=rec.yuv | -r rec.yuv | Path of the reference YUV file, used to compute PSNR and check for a match |
| --threads=N | -t N | Number of decoding threads (default: 1) |
| --md5=M | -m M | Reference MD5, used to verify that the reconstructed YUV output matches |
| --verbose | -v | Whether to report status for every frame (default: on) |
| --help | -h | Show this help message |

## Issue & Pull Request

Issues are welcome. Please clearly describe the environment in which the problem occurs and the run parameters, including the operating system, the compiler environment, etc. If possible, please provide the original input `YUV/bitstream files` to make the results easier to reproduce. [Please fill in issues following the template][6].

If you have the development skills, it is recommended to debug the failing code locally and [submit a Pull Request with the corresponding fix][7].

## Homepages

[Peking University Video Coding Laboratory (PKU-VCL)][1]

`AVS2-P2/IEEE1857.4` Encoder: [xavs2 (Github)][2], [xavs2 (mirror in China)][3]
`AVS2-P2/IEEE1857.4` Decoder: [davs2 (Github)][4], [davs2 (mirror in China)][5]

[1]: http://vcl.idm.pku.edu.cn/ "PKU-VCL"
[2]: https://github.com/pkuvcl/xavs2 "xavs2 github repository"
[3]: https://gitee.com/pkuvcl/xavs2 "xavs2 gitee repository"
[4]: https://github.com/pkuvcl/davs2 "davs2 decoder@github"
[5]: https://gitee.com/pkuvcl/davs2 "davs2 decoder@gitee"
[6]: https://github.com/pkuvcl/davs2/issues "report issues"
[7]: https://github.com/pkuvcl/davs2/pulls "pull request"
davs2-1.6/build/linux/Makefile

# Makefile

include config.mak

vpath %.cc $(SRCPATH)
vpath %.c $(SRCPATH)
vpath %.h $(SRCPATH)
vpath %.S $(SRCPATH)
vpath %.asm $(SRCPATH)
vpath %.rc $(SRCPATH)

CFLAGS += -I$(SRCPATH) -I$(SRCPATH)/.. \
          -I$(SRCPATH)/x86 \
          -I$(SRCPATH)/vec

GENERATED =

all: default
default:

SRCS = common/aec.cc common/alf.cc \
       common/bitstream.cc common/block_info.cc \
       common/common.cc common/davs2.cc common/cpu.cc common/cu.cc \
       common/deblock.cc common/decoder.cc \
       common/frame.cc common/header.cc \
       common/intra.cc common/mc.cc \
       common/memory.cc \
       common/pixel.cc common/predict.cc \
       common/quant.cc \
       common/sao.cc common/transform.cc \
       common/primitives.cc \
       common/threadpool.cc common/win32thread.cc

SRCCLI = test/test.c

SRCSO =
OBJS =
OBJAVX =
OBJSO =
OBJCLI =
#OBJCHK = tools/checkasm.o

## CONFIG: $(shell cat config.h)
##
## ifneq ($(findstring HAVE_THREAD 1, $(CONFIG)),)
## SRCS += common/threadpool.cc
## endif
## ifneq ($(findstring HAVE_WIN32THREAD 1, $(CONFIG)),)
## SRCS += common/win32thread.cc
## endif

# MMX/SSE optims
ifneq ($(AS),)

# asm --------------------------------------------------------------
X86SRC = common/x86/const-a.asm \
         common/x86/blockcopy8.asm \
         common/x86/cpu-a.asm \
         common/x86/dct8.asm \
         common/x86/mc-a2.asm \
         common/x86/pixeladd8.asm \
         common/x86/quant8.asm

ifeq ($(SYS_ARCH),X86)
ARCH_X86 = yes
ASMSRC = $(X86SRC)
endif

## Until now, we do not have 64-bit asm
ifeq ($(SYS_ARCH),X86_64)
ARCH_X86 = yes
SRCS += common/vec/intrinsic.cc \
        common/vec/intrinsic_alf.cc \
        common/vec/intrinsic_sao.cc \
        common/vec/intrinsic_deblock.cc \
        common/vec/intrinsic_intra-filledge.cc \
        common/vec/intrinsic_intra-pred.cc \
        common/vec/intrinsic_inter_pred.cc \
        common/vec/intrinsic_idct.cc \
        common/vec/intrinsic_pixel.cc
SRCSAVX = common/vec/intrinsic_sao_avx2.cc \
          common/vec/intrinsic_deblock_avx2.cc \
          common/vec/intrinsic_intra-pred_avx2.cc \
          common/vec/intrinsic_inter_pred_avx2.cc \
          common/vec/intrinsic_pixel_avx.cc \
          common/vec/intrinsic_idct_avx2.cc
CFLAGS += -mmmx -msse -msse2 -msse3 -msse4 -msse4.1 -msse4.2 -msse4a -mssse3 -mavx
# ASMSRC = $(X86SRC:-32.asm=-64.asm)
ASMSRC = $(X86SRC)
ASFLAGS += -DARCH_X86_64=1
OBJASM = $(ASMSRC:%.asm=%.o)
$(OBJASM): common/x86/x86inc.asm common/x86/x86util.asm
endif

ifdef ARCH_X86
ASFLAGS += -I$(SRCPATH)/x86/
#SRCS += x86/mc-c.cc x86/predict-c.cc
OBJASM = $(ASMSRC:%.asm=%.o)
$(OBJASM): common/x86/x86inc.asm common/x86/x86util.asm
endif

# AltiVec optims
ifeq ($(SYS_ARCH),PPC)
SRCS += common/ppc/mc.cc common/ppc/pixel.cc common/ppc/dct.cc \
        common/ppc/quant.cc common/ppc/deblock.cc \
        common/ppc/predict.cc
endif

# NEON optims
ifeq ($(SYS_ARCH),ARM)
# x264 ARM asm sources
# ASMSRC += common/arm/cpu-a.S common/arm/pixel-a.S common/arm/mc-a.S \
#           common/arm/dct-a.S common/arm/quant-a.S common/arm/deblock-a.S \
#           common/arm/predict-a.S common/arm/bitstream-a.S
# SRCS   += common/arm/mc-c.cc common/arm/predict-c.cc
# x265 ARM asm sources
ASMSRC += common/arm/blockcopy8.S common/arm/cpu-a.S common/arm/dct-a.S \
          common/arm/ipfilter8.S common/arm/mc-a.S common/arm/pixel-util.S \
          common/arm/sad-a.S common/arm/ssd-a.S
OBJASM = $(ASMSRC:%.S=%.o)
endif

# AArch64 NEON optims
ifeq ($(SYS_ARCH),AARCH64)
ASMSRC += common/aarch64/bitstream-a.S \
          common/aarch64/cabac-a.S \
          common/aarch64/dct-a.S \
          common/aarch64/deblock-a.S \
          common/aarch64/mc-a.S \
          common/aarch64/pixel-a.S \
          common/aarch64/predict-a.S \
          common/aarch64/quant-a.S
SRCS += common/aarch64/asm-offsets.cc \
        common/aarch64/mc-c.cc \
        common/aarch64/predict-c.cc
OBJASM = $(ASMSRC:%.S=%.o)
OBJCHK += tools/checkasm-aarch64.o
endif

# MSA optims
ifeq ($(SYS_ARCH),MIPS)
ifneq ($(findstring HAVE_MSA 1, $(CONFIG)),)
SRCS += common/mips/mc-c.cc common/mips/dct-c.cc \
        common/mips/deblock-c.cc common/mips/pixel-c.cc \
        common/mips/predict-c.cc common/mips/quant-c.cc
endif
endif

# asm --------------------------------------------------------------
endif # here ends ifneq ($(AS),)

ifneq ($(HAVE_GETOPT_LONG),1)
SRCS += compat/getopt/getopt.cc
endif

## Windows Dll
## ifeq ($(SYS), WINDOWS)
## # OBJCLI += $(if $(RC), davs2res.o)
## ifneq ($(SONAME),)
## SRCSO += davs2dll.cc
## OBJSO += $(if $(RC), davs2res.dll.o)
## endif
## endif

OBJS   += $(SRCS:%.cc=%.o)
OBJAVX += $(SRCSAVX:%.cc=%.o)
OBJCLI += $(SRCCLI:%.c=%.o)
OBJSO  += $(SRCSO:%.cc=%.o)

.PHONY: all default fprofiled clean distclean install install-* uninstall cli lib-* etags

cli: davs2$(EXE)
lib-static: $(LIBDAVS2)
lib-shared: $(SONAME)

$(LIBDAVS2): $(GENERATED) .depend $(OBJS) $(OBJAVX) $(OBJASM)
	@echo "\033[33m [linking static] $(LIBDAVS2) \033[0m"
	rm -f $(LIBDAVS2)
	$(AR)$@ $(OBJS) $(OBJAVX) $(OBJASM)
	$(if $(RANLIB), $(RANLIB) $@)

$(SONAME): $(GENERATED) .depend $(OBJS) $(OBJAVX) $(OBJASM) $(OBJSO)
	@echo "\033[33m [linking shared] $(SONAME) \033[0m"
	$(LD)$@ $(OBJS) $(OBJAVX) $(OBJASM) $(OBJSO) $(SOFLAGS) $(LDFLAGS)

ifneq ($(EXE),)
.PHONY: davs2 checkasm
davs2: davs2$(EXE)
checkasm: checkasm$(EXE)
endif

davs2$(EXE): $(GENERATED) .depend $(OBJCLI) $(CLI_LIBDAVS2)
	@echo "\033[33m [linking execution] davs2$(EXE) \033[0m"
	$(LD)$@ $(OBJCLI) $(CLI_LIBDAVS2) $(LDFLAGSCLI) $(LDFLAGS)

checkasm$(EXE): $(GENERATED) .depend $(OBJCHK) $(LIBDAVS2)
	@echo "\033[33m [linking checkasm] checkasm$(EXE) \033[0m"
	$(LD)$@ $(OBJCHK) $(LIBDAVS2) $(LDFLAGS)

$(OBJS) $(OBJAVX) $(OBJASM) $(OBJSO) $(OBJCLI) $(OBJCHK): .depend

%.o: %.asm common/x86/x86inc.asm common/x86/x86util.asm
	@echo "\033[33m [Compiling asm]: $< \033[0m"
	$(AS) $(ASFLAGS) -o $@ $<
	-@ $(if $(STRIP), $(STRIP) -x $@) # delete local/anonymous symbols, so they don't show up in oprofile

%.o: %.S
	@echo "\033[33m [Compiling asm]: $< \033[0m"
	$(AS) $(ASFLAGS) -o $@ $<
	-@ $(if $(STRIP), $(STRIP) -x $@) # delete local/anonymous symbols, so they don't show up in oprofile

%.dll.o: %.rc davs2.h
	@echo "\033[33m [Compiling dll]: $< \033[0m"
	$(RC) $(RCFLAGS)$@ -DDLL $<

%.o: %.rc davs2.h
	@echo "\033[33m [Compiling rc]: $< \033[0m"
	$(RC) $(RCFLAGS)$@ $<

$(OBJAVX):
	@echo "\033[33m [Compiling]: $(@:.o=.cc) \033[0m"
	$(CC) $(CFLAGS) -mavx2 -c -o $@ $(SRCPATH)/$(@:.o=.cc)

%.o: %.cc
	@echo "\033[33m [Compiling]: $< \033[0m"
	$(CC) $(CFLAGS) -c -o $@ $<

%.o: %.c
	@echo "\033[33m [Compiling]: $< \033[0m"
	$(CC) $(CFLAGS) -c -o $@ $<

.depend: config.mak
	@rm -f .depend
	@echo "\033[33m dependency file generation... \033[0m"
ifeq ($(COMPILER),CL)
	@$(foreach SRC, $(addprefix $(SRCPATH)/, $(SRCS) $(SRCCLI) $(SRCSO)), $(SRCPATH)/tools/msvsdepend.sh "$(CC)" "$(CFLAGS)" "$(SRC)" "$(SRC:$(SRCPATH)/%.cc=%.o)" 1>> .depend;)
	@$(foreach SRC, $(addprefix $(SRCPATH)/, $(SRCSAVX)), $(SRCPATH)/tools/msvsdepend.sh "$(CC)" "$(CFLAGS)" "$(SRC)" "$(SRC:$(SRCPATH)/%.cc=%.o)" 1>> .depend;)
else
	@$(foreach SRC, $(addprefix $(SRCPATH)/, $(SRCS) $(SRCCLI) $(SRCSO)), $(CC) $(CFLAGS) $(SRC) $(DEPMT) $(SRC:$(SRCPATH)/%.cc=%.o) $(DEPMM) 1>> .depend;)
	@$(foreach SRC, $(addprefix $(SRCPATH)/, $(SRCSAVX)), $(CC) $(CFLAGS) $(SRC) $(DEPMT) $(SRC:$(SRCPATH)/%.cc=%.o) $(DEPMM) 1>> .depend;)
endif

config.mak:
	./configure

depend: .depend
ifneq ($(wildcard .depend),)
include .depend
endif

SRC2 = $(SRCS) $(SRCCLI)

# These should cover most of the important codepaths
OPT0 = --crf 30 -b1 -m1 -r1 --me dia --no-cabac --direct temporal --ssim --no-weightb
OPT1 = --crf 16 -b2 -m3 -r3 --me hex --no-8x8dct --direct spatial --no-dct-decimate -t0 --slice-max-mbs 50
OPT2 = --crf 26 -b4 -m5 -r2 --me hex --cqm jvt --nr 100 --psnr --no-mixed-refs --b-adapt 2 --slice-max-size 1500
OPT3 = --crf 18 -b3 -m9 -r5 --me umh -t1 -A all --b-pyramid normal --direct auto --no-fast-pskip --no-mbtree
OPT4 = --crf 22 -b3 -m7 -r4 --me esa -t2 -A all --psy-rd 1.0:1.0 --slices 4
OPT5 = --frames 50 --crf 24 -b3 -m10 -r3 --me tesa -t2
OPT6 = --frames 50 -q0 -m9 -r2 --me hex -Aall
OPT7 = --frames 50 -q0 -m2 -r1 --me hex --no-cabac

ifeq (,$(VIDS))
fprofiled:
	@echo 'usage: make fprofiled VIDS="infile1 infile2 ..."'
	@echo 'where infiles are anything that davs2 understands,'
	@echo 'i.e. YUV with resolution in the filename, y4m, or avisynth.'
else
fprofiled:
	$(MAKE) clean
	$(MAKE) davs2$(EXE) CFLAGS="$(CFLAGS) $(PROF_GEN_CC)" LDFLAGS="$(LDFLAGS) $(PROF_GEN_LD)"
	$(foreach V, $(VIDS), $(foreach I, 0 1 2 3 4 5 6 7, ./davs2$(EXE) $(OPT$I) --threads 1 $(V) -o $(DEVNULL) ;))
ifeq ($(COMPILER),CL)
# Because Visual Studio timestamps the object files within the PGD, it fails to build if they change - only the executable should be deleted
	rm -f davs2$(EXE)
else
	rm -f $(SRC2:%.cc=%.o)
endif
	$(MAKE) CFLAGS="$(CFLAGS) $(PROF_USE_CC)" LDFLAGS="$(LDFLAGS) $(PROF_USE_LD)"
	rm -f $(SRC2:%.cc=%.gcda) $(SRC2:%.cc=%.gcno) *.dyn pgopti.dpi pgopti.dpi.lock *.pgd *.pgc
endif

clean:
	rm -f $(OBJS) $(OBJASM) $(OBJCLI) $(OBJSO) $(SONAME)
	rm -f *.a *.lib *.exp *.pdb libdavs2.so* davs2 davs2.exe .depend TAGS
	rm -f checkasm checkasm.exe $(OBJCHK) $(GENERATED) davs2_lookahead.cclbin
	rm -f example example.exe $(OBJEXAMPLE)
	rm -f $(SRC2:%.cc=%.gcda) $(SRC2:%.cc=%.gcno) *.dyn pgopti.dpi pgopti.dpi.lock *.pgd *.pgc

distclean: clean
	rm -f config.mak davs2_config.h config.h config.log davs2.pc davs2.def conftest*

install-cli: cli
	$(INSTALL) -d $(DESTDIR)$(bindir)
	$(INSTALL) davs2$(EXE) $(DESTDIR)$(bindir)

install-lib-dev:
	$(INSTALL) -d $(DESTDIR)$(includedir)
	$(INSTALL) -d $(DESTDIR)$(libdir)
	$(INSTALL) -d $(DESTDIR)$(libdir)/pkgconfig
	$(INSTALL) -m 644 $(SRCPATH)/davs2.h $(DESTDIR)$(includedir)
	$(INSTALL) -m 644 davs2_config.h $(DESTDIR)$(includedir)
	$(INSTALL) -m 644 davs2.pc $(DESTDIR)$(libdir)/pkgconfig

install-lib-static: lib-static install-lib-dev
	$(INSTALL) -m 644 $(LIBDAVS2) $(DESTDIR)$(libdir)
	$(if $(RANLIB), $(RANLIB) $(DESTDIR)$(libdir)/$(LIBDAVS2))

install-lib-shared: lib-shared install-lib-dev
ifneq ($(IMPLIBNAME),)
	$(INSTALL) -d $(DESTDIR)$(bindir)
	$(INSTALL) -m 755 $(SONAME) $(DESTDIR)$(bindir)
	$(INSTALL) -m 644 $(IMPLIBNAME) $(DESTDIR)$(libdir)
else ifneq ($(SONAME),)
	ln -f -s $(SONAME) $(DESTDIR)$(libdir)/libdavs2.$(SOSUFFIX)
	$(INSTALL) -m 755 $(SONAME) $(DESTDIR)$(libdir)
endif

uninstall:
	rm -f $(DESTDIR)$(includedir)/davs2.h $(DESTDIR)$(includedir)/davs2_config.h $(DESTDIR)$(libdir)/libdavs2.a
	rm -f $(DESTDIR)$(bindir)/davs2$(EXE) $(DESTDIR)$(libdir)/pkgconfig/davs2.pc
ifneq ($(IMPLIBNAME),)
	rm -f $(DESTDIR)$(bindir)/$(SONAME) $(DESTDIR)$(libdir)/$(IMPLIBNAME)
else ifneq ($(SONAME),)
	rm -f $(DESTDIR)$(libdir)/$(SONAME) $(DESTDIR)$(libdir)/libdavs2.$(SOSUFFIX)
endif

etags: TAGS
TAGS:
	etags $(SRCS)
davs2-1.6/build/linux/config.guess

#! /bin/sh
# Attempt to guess a canonical system name.
#   Copyright 1992-2017 Free Software Foundation, Inc.

timestamp='2017-05-27'

# This file is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, see <http://www.gnu.org/licenses/>.
#
# As a special exception to the GNU General Public License, if you
# distribute this file as part of a program that contains a
# configuration script generated by Autoconf, you may include it under
# the same distribution terms that you use for the rest of that
# program.  This Exception is an additional permission under section 7
# of the GNU General Public License, version 3 ("GPLv3").
#
# Originally written by Per Bothner; maintained since 2000 by Ben Elliston.
#
# You can get the latest version of this script from:
# http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess
#
# Please send patches to <config-patches@gnu.org>.

me=`echo "$0" | sed -e 's,.*/,,'`

usage="\
Usage: $0 [OPTION]

Output the configuration name of the system \`$me' is run on.

Operation modes:
  -h, --help         print this help, then exit
  -t, --time-stamp   print date of last modification, then exit
  -v, --version      print version number, then exit

Report bugs and patches to <config-patches@gnu.org>."

version="\
GNU config.guess ($timestamp)

Originally written by Per Bothner.
Copyright 1992-2017 Free Software Foundation, Inc.

This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."

help="
Try \`$me --help' for more information."

# Parse command line
while test $# -gt 0 ; do
  case $1 in
    --time-stamp | --time* | -t )
       echo "$timestamp" ; exit ;;
    --version | -v )
       echo "$version" ; exit ;;
    --help | --h* | -h )
       echo "$usage"; exit ;;
    -- )     # Stop option processing
       shift; break ;;
    - )      # Use stdin as input.
       break ;;
    -* )
       echo "$me: invalid option $1$help" >&2
       exit 1 ;;
    * )
       break ;;
  esac
done

if test $# != 0; then
  echo "$me: too many arguments$help" >&2
  exit 1
fi

trap 'exit 1' 1 2 15

# CC_FOR_BUILD -- compiler used by this script. Note that the use of a
# compiler to aid in system detection is discouraged as it requires
# temporary files to be created and, as you can see below, it is a
# headache to deal with in a portable fashion.

# Historically, `CC_FOR_BUILD' used to be named `HOST_CC'. We still
# use `HOST_CC' if defined, but it is deprecated.

# Portable tmp directory creation inspired by the Autoconf team.
set_cc_for_build=' trap "exitcode=\$?; (rm -f \$tmpfiles 2>/dev/null; rmdir \$tmp 2>/dev/null) && exit \$exitcode" 0 ; trap "rm -f \$tmpfiles 2>/dev/null; rmdir \$tmp 2>/dev/null; exit 1" 1 2 13 15 ; : ${TMPDIR=/tmp} ; { tmp=`(umask 077 && mktemp -d "$TMPDIR/cgXXXXXX") 2>/dev/null` && test -n "$tmp" && test -d "$tmp" ; } || { test -n "$RANDOM" && tmp=$TMPDIR/cg$$-$RANDOM && (umask 077 && mkdir $tmp) ; } || { tmp=$TMPDIR/cg-$$ && (umask 077 && mkdir $tmp) && echo "Warning: creating insecure temp directory" >&2 ; } || { echo "$me: cannot create a temporary directory in $TMPDIR" >&2 ; exit 1 ; } ; dummy=$tmp/dummy ; tmpfiles="$dummy.c $dummy.o $dummy.rel $dummy" ; case $CC_FOR_BUILD,$HOST_CC,$CC in ,,) echo "int x;" > $dummy.c ; for c in cc gcc c89 c99 ; do if ($c -c -o $dummy.o $dummy.c) >/dev/null 2>&1 ; then CC_FOR_BUILD="$c"; break ; fi ; done ; if test x"$CC_FOR_BUILD" = x ; then CC_FOR_BUILD=no_compiler_found ; fi ;; ,,*) CC_FOR_BUILD=$CC ;; ,*,*) CC_FOR_BUILD=$HOST_CC ;; esac ; set_cc_for_build= ;' # This is needed to find uname on a Pyramid OSx when run in the BSD universe. # (ghazi@noc.rutgers.edu 1994-08-24) if (test -f /.attbin/uname) >/dev/null 2>&1 ; then PATH=$PATH:/.attbin ; export PATH fi UNAME_MACHINE=`(uname -m) 2>/dev/null` || UNAME_MACHINE=unknown UNAME_RELEASE=`(uname -r) 2>/dev/null` || UNAME_RELEASE=unknown UNAME_SYSTEM=`(uname -s) 2>/dev/null` || UNAME_SYSTEM=unknown UNAME_VERSION=`(uname -v) 2>/dev/null` || UNAME_VERSION=unknown case "${UNAME_SYSTEM}" in Linux|GNU|GNU/*) # If the system lacks a compiler, then just pick glibc. # We could probably try harder. LIBC=gnu eval $set_cc_for_build cat <<-EOF > $dummy.c #include #if defined(__UCLIBC__) LIBC=uclibc #elif defined(__dietlibc__) LIBC=dietlibc #else LIBC=gnu #endif EOF eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^LIBC' | sed 's, ,,g'` ;; esac # Note: order is significant - the case branches are not exclusive. case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in *:NetBSD:*:*) # NetBSD (nbsd) targets should (where applicable) match one or # more of the tuples: *-*-netbsdelf*, *-*-netbsdaout*, # *-*-netbsdecoff* and *-*-netbsd*. For targets that recently # switched to ELF, *-*-netbsd* would select the old # object file format. This provides both forward # compatibility and a consistent mechanism for selecting the # object file format. # # Note: NetBSD doesn't particularly care about the vendor # portion of the name. We always set it to "unknown". sysctl="sysctl -n hw.machine_arch" UNAME_MACHINE_ARCH=`(uname -p 2>/dev/null || \ /sbin/$sysctl 2>/dev/null || \ /usr/sbin/$sysctl 2>/dev/null || \ echo unknown)` case "${UNAME_MACHINE_ARCH}" in armeb) machine=armeb-unknown ;; arm*) machine=arm-unknown ;; sh3el) machine=shl-unknown ;; sh3eb) machine=sh-unknown ;; sh5el) machine=sh5le-unknown ;; earmv*) arch=`echo ${UNAME_MACHINE_ARCH} | sed -e 's,^e\(armv[0-9]\).*$,\1,'` endian=`echo ${UNAME_MACHINE_ARCH} | sed -ne 's,^.*\(eb\)$,\1,p'` machine=${arch}${endian}-unknown ;; *) machine=${UNAME_MACHINE_ARCH}-unknown ;; esac # The Operating System including object format, if it has switched # to ELF recently (or will in the future) and ABI. case "${UNAME_MACHINE_ARCH}" in earm*) os=netbsdelf ;; arm*|i386|m68k|ns32k|sh3*|sparc|vax) eval $set_cc_for_build if echo __ELF__ | $CC_FOR_BUILD -E - 2>/dev/null \ | grep -q __ELF__ then # Once all utilities can be ECOFF (netbsdecoff) or a.out (netbsdaout). # Return netbsd for either. FIX? 
os=netbsd else os=netbsdelf fi ;; *) os=netbsd ;; esac # Determine ABI tags. case "${UNAME_MACHINE_ARCH}" in earm*) expr='s/^earmv[0-9]/-eabi/;s/eb$//' abi=`echo ${UNAME_MACHINE_ARCH} | sed -e "$expr"` ;; esac # The OS release # Debian GNU/NetBSD machines have a different userland, and # thus, need a distinct triplet. However, they do not need # kernel version information, so it can be replaced with a # suitable tag, in the style of linux-gnu. case "${UNAME_VERSION}" in Debian*) release='-gnu' ;; *) release=`echo ${UNAME_RELEASE} | sed -e 's/[-_].*//' | cut -d. -f1,2` ;; esac # Since CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM: # contains redundant information, the shorter form: # CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM is used. echo "${machine}-${os}${release}${abi}" exit ;; *:Bitrig:*:*) UNAME_MACHINE_ARCH=`arch | sed 's/Bitrig.//'` echo ${UNAME_MACHINE_ARCH}-unknown-bitrig${UNAME_RELEASE} exit ;; *:OpenBSD:*:*) UNAME_MACHINE_ARCH=`arch | sed 's/OpenBSD.//'` echo ${UNAME_MACHINE_ARCH}-unknown-openbsd${UNAME_RELEASE} exit ;; *:LibertyBSD:*:*) UNAME_MACHINE_ARCH=`arch | sed 's/^.*BSD\.//'` echo ${UNAME_MACHINE_ARCH}-unknown-libertybsd${UNAME_RELEASE} exit ;; *:ekkoBSD:*:*) echo ${UNAME_MACHINE}-unknown-ekkobsd${UNAME_RELEASE} exit ;; *:SolidBSD:*:*) echo ${UNAME_MACHINE}-unknown-solidbsd${UNAME_RELEASE} exit ;; macppc:MirBSD:*:*) echo powerpc-unknown-mirbsd${UNAME_RELEASE} exit ;; *:MirBSD:*:*) echo ${UNAME_MACHINE}-unknown-mirbsd${UNAME_RELEASE} exit ;; *:Sortix:*:*) echo ${UNAME_MACHINE}-unknown-sortix exit ;; alpha:OSF1:*:*) case $UNAME_RELEASE in *4.0) UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $3}'` ;; *5.*) UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $4}'` ;; esac # According to Compaq, /usr/sbin/psrinfo has been available on # OSF/1 and Tru64 systems produced since 1995. I hope that # covers most systems running today. This code pipes the CPU # types through head -n 1, so we only detect the type of CPU 0. ALPHA_CPU_TYPE=`/usr/sbin/psrinfo -v | sed -n -e 's/^ The alpha \(.*\) processor.*$/\1/p' | head -n 1` case "$ALPHA_CPU_TYPE" in "EV4 (21064)") UNAME_MACHINE=alpha ;; "EV4.5 (21064)") UNAME_MACHINE=alpha ;; "LCA4 (21066/21068)") UNAME_MACHINE=alpha ;; "EV5 (21164)") UNAME_MACHINE=alphaev5 ;; "EV5.6 (21164A)") UNAME_MACHINE=alphaev56 ;; "EV5.6 (21164PC)") UNAME_MACHINE=alphapca56 ;; "EV5.7 (21164PC)") UNAME_MACHINE=alphapca57 ;; "EV6 (21264)") UNAME_MACHINE=alphaev6 ;; "EV6.7 (21264A)") UNAME_MACHINE=alphaev67 ;; "EV6.8CB (21264C)") UNAME_MACHINE=alphaev68 ;; "EV6.8AL (21264B)") UNAME_MACHINE=alphaev68 ;; "EV6.8CX (21264D)") UNAME_MACHINE=alphaev68 ;; "EV6.9A (21264/EV69A)") UNAME_MACHINE=alphaev69 ;; "EV7 (21364)") UNAME_MACHINE=alphaev7 ;; "EV7.9 (21364A)") UNAME_MACHINE=alphaev79 ;; esac # A Pn.n version is a patched version. # A Vn.n version is a released version. # A Tn.n version is a released field test version. # A Xn.n version is an unreleased experimental baselevel. # 1.2 uses "1.2" for uname -r. echo ${UNAME_MACHINE}-dec-osf`echo ${UNAME_RELEASE} | sed -e 's/^[PVTX]//' | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz` # Reset EXIT trap before exiting to avoid spurious non-zero exit code. exitcode=$? trap '' 0 exit $exitcode ;; Alpha\ *:Windows_NT*:*) # How do we know it's Interix rather than the generic POSIX subsystem? # Should we change UNAME_MACHINE based on the output of uname instead # of the specific Alpha model? 
echo alpha-pc-interix exit ;; 21064:Windows_NT:50:3) echo alpha-dec-winnt3.5 exit ;; Amiga*:UNIX_System_V:4.0:*) echo m68k-unknown-sysv4 exit ;; *:[Aa]miga[Oo][Ss]:*:*) echo ${UNAME_MACHINE}-unknown-amigaos exit ;; *:[Mm]orph[Oo][Ss]:*:*) echo ${UNAME_MACHINE}-unknown-morphos exit ;; *:OS/390:*:*) echo i370-ibm-openedition exit ;; *:z/VM:*:*) echo s390-ibm-zvmoe exit ;; *:OS400:*:*) echo powerpc-ibm-os400 exit ;; arm:RISC*:1.[012]*:*|arm:riscix:1.[012]*:*) echo arm-acorn-riscix${UNAME_RELEASE} exit ;; arm*:riscos:*:*|arm*:RISCOS:*:*) echo arm-unknown-riscos exit ;; SR2?01:HI-UX/MPP:*:* | SR8000:HI-UX/MPP:*:*) echo hppa1.1-hitachi-hiuxmpp exit ;; Pyramid*:OSx*:*:* | MIS*:OSx*:*:* | MIS*:SMP_DC-OSx*:*:*) # akee@wpdis03.wpafb.af.mil (Earle F. Ake) contributed MIS and NILE. if test "`(/bin/universe) 2>/dev/null`" = att ; then echo pyramid-pyramid-sysv3 else echo pyramid-pyramid-bsd fi exit ;; NILE*:*:*:dcosx) echo pyramid-pyramid-svr4 exit ;; DRS?6000:unix:4.0:6*) echo sparc-icl-nx6 exit ;; DRS?6000:UNIX_SV:4.2*:7* | DRS?6000:isis:4.2*:7*) case `/usr/bin/uname -p` in sparc) echo sparc-icl-nx7; exit ;; esac ;; s390x:SunOS:*:*) echo ${UNAME_MACHINE}-ibm-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` exit ;; sun4H:SunOS:5.*:*) echo sparc-hal-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` exit ;; sun4*:SunOS:5.*:* | tadpole*:SunOS:5.*:*) echo sparc-sun-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` exit ;; i86pc:AuroraUX:5.*:* | i86xen:AuroraUX:5.*:*) echo i386-pc-auroraux${UNAME_RELEASE} exit ;; i86pc:SunOS:5.*:* | i86xen:SunOS:5.*:*) eval $set_cc_for_build SUN_ARCH=i386 # If there is a compiler, see if it is configured for 64-bit objects. # Note that the Sun cc does not turn __LP64__ into 1 like gcc does. # This test works for both compilers. if [ "$CC_FOR_BUILD" != no_compiler_found ]; then if (echo '#ifdef __amd64'; echo IS_64BIT_ARCH; echo '#endif') | \ (CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) | \ grep IS_64BIT_ARCH >/dev/null then SUN_ARCH=x86_64 fi fi echo ${SUN_ARCH}-pc-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` exit ;; sun4*:SunOS:6*:*) # According to config.sub, this is the proper way to canonicalize # SunOS6. Hard to guess exactly what SunOS6 will be like, but # it's likely to be more like Solaris than SunOS4. echo sparc-sun-solaris3`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` exit ;; sun4*:SunOS:*:*) case "`/usr/bin/arch -k`" in Series*|S4*) UNAME_RELEASE=`uname -v` ;; esac # Japanese Language versions have a version number like `4.1.3-JL'. echo sparc-sun-sunos`echo ${UNAME_RELEASE}|sed -e 's/-/_/'` exit ;; sun3*:SunOS:*:*) echo m68k-sun-sunos${UNAME_RELEASE} exit ;; sun*:*:4.2BSD:*) UNAME_RELEASE=`(sed 1q /etc/motd | awk '{print substr($5,1,3)}') 2>/dev/null` test "x${UNAME_RELEASE}" = x && UNAME_RELEASE=3 case "`/bin/arch`" in sun3) echo m68k-sun-sunos${UNAME_RELEASE} ;; sun4) echo sparc-sun-sunos${UNAME_RELEASE} ;; esac exit ;; aushp:SunOS:*:*) echo sparc-auspex-sunos${UNAME_RELEASE} exit ;; # The situation for MiNT is a little confusing. The machine name # can be virtually everything (everything which is not # "atarist" or "atariste" at least should have a processor # > m68000). The system name ranges from "MiNT" over "FreeMiNT" # to the lowercase version "mint" (or "freemint"). Finally # the system name "TOS" denotes a system which is actually not # MiNT. But MiNT is downward compatible to TOS, so this should # be no problem. 
atarist[e]:*MiNT:*:* | atarist[e]:*mint:*:* | atarist[e]:*TOS:*:*) echo m68k-atari-mint${UNAME_RELEASE} exit ;; atari*:*MiNT:*:* | atari*:*mint:*:* | atarist[e]:*TOS:*:*) echo m68k-atari-mint${UNAME_RELEASE} exit ;; *falcon*:*MiNT:*:* | *falcon*:*mint:*:* | *falcon*:*TOS:*:*) echo m68k-atari-mint${UNAME_RELEASE} exit ;; milan*:*MiNT:*:* | milan*:*mint:*:* | *milan*:*TOS:*:*) echo m68k-milan-mint${UNAME_RELEASE} exit ;; hades*:*MiNT:*:* | hades*:*mint:*:* | *hades*:*TOS:*:*) echo m68k-hades-mint${UNAME_RELEASE} exit ;; *:*MiNT:*:* | *:*mint:*:* | *:*TOS:*:*) echo m68k-unknown-mint${UNAME_RELEASE} exit ;; m68k:machten:*:*) echo m68k-apple-machten${UNAME_RELEASE} exit ;; powerpc:machten:*:*) echo powerpc-apple-machten${UNAME_RELEASE} exit ;; RISC*:Mach:*:*) echo mips-dec-mach_bsd4.3 exit ;; RISC*:ULTRIX:*:*) echo mips-dec-ultrix${UNAME_RELEASE} exit ;; VAX*:ULTRIX*:*:*) echo vax-dec-ultrix${UNAME_RELEASE} exit ;; 2020:CLIX:*:* | 2430:CLIX:*:*) echo clipper-intergraph-clix${UNAME_RELEASE} exit ;; mips:*:*:UMIPS | mips:*:*:RISCos) eval $set_cc_for_build sed 's/^ //' << EOF >$dummy.c #ifdef __cplusplus #include /* for printf() prototype */ int main (int argc, char *argv[]) { #else int main (argc, argv) int argc; char *argv[]; { #endif #if defined (host_mips) && defined (MIPSEB) #if defined (SYSTYPE_SYSV) printf ("mips-mips-riscos%ssysv\n", argv[1]); exit (0); #endif #if defined (SYSTYPE_SVR4) printf ("mips-mips-riscos%ssvr4\n", argv[1]); exit (0); #endif #if defined (SYSTYPE_BSD43) || defined(SYSTYPE_BSD) printf ("mips-mips-riscos%sbsd\n", argv[1]); exit (0); #endif #endif exit (-1); } EOF $CC_FOR_BUILD -o $dummy $dummy.c && dummyarg=`echo "${UNAME_RELEASE}" | sed -n 's/\([0-9]*\).*/\1/p'` && SYSTEM_NAME=`$dummy $dummyarg` && { echo "$SYSTEM_NAME"; exit; } echo mips-mips-riscos${UNAME_RELEASE} exit ;; Motorola:PowerMAX_OS:*:*) echo powerpc-motorola-powermax exit ;; Motorola:*:4.3:PL8-*) echo powerpc-harris-powermax exit ;; Night_Hawk:*:*:PowerMAX_OS | Synergy:PowerMAX_OS:*:*) echo powerpc-harris-powermax exit ;; Night_Hawk:Power_UNIX:*:*) echo powerpc-harris-powerunix exit ;; m88k:CX/UX:7*:*) echo m88k-harris-cxux7 exit ;; m88k:*:4*:R4*) echo m88k-motorola-sysv4 exit ;; m88k:*:3*:R3*) echo m88k-motorola-sysv3 exit ;; AViiON:dgux:*:*) # DG/UX returns AViiON for all architectures UNAME_PROCESSOR=`/usr/bin/uname -p` if [ $UNAME_PROCESSOR = mc88100 ] || [ $UNAME_PROCESSOR = mc88110 ] then if [ ${TARGET_BINARY_INTERFACE}x = m88kdguxelfx ] || \ [ ${TARGET_BINARY_INTERFACE}x = x ] then echo m88k-dg-dgux${UNAME_RELEASE} else echo m88k-dg-dguxbcs${UNAME_RELEASE} fi else echo i586-dg-dgux${UNAME_RELEASE} fi exit ;; M88*:DolphinOS:*:*) # DolphinOS (SVR3) echo m88k-dolphin-sysv3 exit ;; M88*:*:R3*:*) # Delta 88k system running SVR3 echo m88k-motorola-sysv3 exit ;; XD88*:*:*:*) # Tektronix XD88 system running UTekV (SVR3) echo m88k-tektronix-sysv3 exit ;; Tek43[0-9][0-9]:UTek:*:*) # Tektronix 4300 system running UTek (BSD) echo m68k-tektronix-bsd exit ;; *:IRIX*:*:*) echo mips-sgi-irix`echo ${UNAME_RELEASE}|sed -e 's/-/_/g'` exit ;; ????????:AIX?:[12].1:2) # AIX 2.2.1 or AIX 2.1.1 is RT/PC AIX. 
echo romp-ibm-aix # uname -m gives an 8 hex-code CPU id exit ;; # Note that: echo "'`uname -s`'" gives 'AIX ' i*86:AIX:*:*) echo i386-ibm-aix exit ;; ia64:AIX:*:*) if [ -x /usr/bin/oslevel ] ; then IBM_REV=`/usr/bin/oslevel` else IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE} fi echo ${UNAME_MACHINE}-ibm-aix${IBM_REV} exit ;; *:AIX:2:3) if grep bos325 /usr/include/stdio.h >/dev/null 2>&1; then eval $set_cc_for_build sed 's/^ //' << EOF >$dummy.c #include main() { if (!__power_pc()) exit(1); puts("powerpc-ibm-aix3.2.5"); exit(0); } EOF if $CC_FOR_BUILD -o $dummy $dummy.c && SYSTEM_NAME=`$dummy` then echo "$SYSTEM_NAME" else echo rs6000-ibm-aix3.2.5 fi elif grep bos324 /usr/include/stdio.h >/dev/null 2>&1; then echo rs6000-ibm-aix3.2.4 else echo rs6000-ibm-aix3.2 fi exit ;; *:AIX:*:[4567]) IBM_CPU_ID=`/usr/sbin/lsdev -C -c processor -S available | sed 1q | awk '{ print $1 }'` if /usr/sbin/lsattr -El ${IBM_CPU_ID} | grep ' POWER' >/dev/null 2>&1; then IBM_ARCH=rs6000 else IBM_ARCH=powerpc fi if [ -x /usr/bin/lslpp ] ; then IBM_REV=`/usr/bin/lslpp -Lqc bos.rte.libc | awk -F: '{ print $3 }' | sed s/[0-9]*$/0/` else IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE} fi echo ${IBM_ARCH}-ibm-aix${IBM_REV} exit ;; *:AIX:*:*) echo rs6000-ibm-aix exit ;; ibmrt:4.4BSD:*|romp-ibm:BSD:*) echo romp-ibm-bsd4.4 exit ;; ibmrt:*BSD:*|romp-ibm:BSD:*) # covers RT/PC BSD and echo romp-ibm-bsd${UNAME_RELEASE} # 4.3 with uname added to exit ;; # report: romp-ibm BSD 4.3 *:BOSX:*:*) echo rs6000-bull-bosx exit ;; DPX/2?00:B.O.S.:*:*) echo m68k-bull-sysv3 exit ;; 9000/[34]??:4.3bsd:1.*:*) echo m68k-hp-bsd exit ;; hp300:4.4BSD:*:* | 9000/[34]??:4.3bsd:2.*:*) echo m68k-hp-bsd4.4 exit ;; 9000/[34678]??:HP-UX:*:*) HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'` case "${UNAME_MACHINE}" in 9000/31? ) HP_ARCH=m68000 ;; 9000/[34]?? ) HP_ARCH=m68k ;; 9000/[678][0-9][0-9]) if [ -x /usr/bin/getconf ]; then sc_cpu_version=`/usr/bin/getconf SC_CPU_VERSION 2>/dev/null` sc_kernel_bits=`/usr/bin/getconf SC_KERNEL_BITS 2>/dev/null` case "${sc_cpu_version}" in 523) HP_ARCH=hppa1.0 ;; # CPU_PA_RISC1_0 528) HP_ARCH=hppa1.1 ;; # CPU_PA_RISC1_1 532) # CPU_PA_RISC2_0 case "${sc_kernel_bits}" in 32) HP_ARCH=hppa2.0n ;; 64) HP_ARCH=hppa2.0w ;; '') HP_ARCH=hppa2.0 ;; # HP-UX 10.20 esac ;; esac fi if [ "${HP_ARCH}" = "" ]; then eval $set_cc_for_build sed 's/^ //' << EOF >$dummy.c #define _HPUX_SOURCE #include #include int main () { #if defined(_SC_KERNEL_BITS) long bits = sysconf(_SC_KERNEL_BITS); #endif long cpu = sysconf (_SC_CPU_VERSION); switch (cpu) { case CPU_PA_RISC1_0: puts ("hppa1.0"); break; case CPU_PA_RISC1_1: puts ("hppa1.1"); break; case CPU_PA_RISC2_0: #if defined(_SC_KERNEL_BITS) switch (bits) { case 64: puts ("hppa2.0w"); break; case 32: puts ("hppa2.0n"); break; default: puts ("hppa2.0"); break; } break; #else /* !defined(_SC_KERNEL_BITS) */ puts ("hppa2.0"); break; #endif default: puts ("hppa1.0"); break; } exit (0); } EOF (CCOPTS="" $CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null) && HP_ARCH=`$dummy` test -z "$HP_ARCH" && HP_ARCH=hppa fi ;; esac if [ ${HP_ARCH} = hppa2.0w ] then eval $set_cc_for_build # hppa2.0w-hp-hpux* has a 64-bit kernel and a compiler generating # 32-bit code. hppa64-hp-hpux* has the same kernel and a compiler # generating 64-bit code. 
GNU and HP use different nomenclature: # # $ CC_FOR_BUILD=cc ./config.guess # => hppa2.0w-hp-hpux11.23 # $ CC_FOR_BUILD="cc +DA2.0w" ./config.guess # => hppa64-hp-hpux11.23 if echo __LP64__ | (CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) | grep -q __LP64__ then HP_ARCH=hppa2.0w else HP_ARCH=hppa64 fi fi echo ${HP_ARCH}-hp-hpux${HPUX_REV} exit ;; ia64:HP-UX:*:*) HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'` echo ia64-hp-hpux${HPUX_REV} exit ;; 3050*:HI-UX:*:*) eval $set_cc_for_build sed 's/^ //' << EOF >$dummy.c #include <unistd.h> int main () { long cpu = sysconf (_SC_CPU_VERSION); /* The order matters, because CPU_IS_HP_MC68K erroneously returns true for CPU_PA_RISC1_0. CPU_IS_PA_RISC returns correct results, however. */ if (CPU_IS_PA_RISC (cpu)) { switch (cpu) { case CPU_PA_RISC1_0: puts ("hppa1.0-hitachi-hiuxwe2"); break; case CPU_PA_RISC1_1: puts ("hppa1.1-hitachi-hiuxwe2"); break; case CPU_PA_RISC2_0: puts ("hppa2.0-hitachi-hiuxwe2"); break; default: puts ("hppa-hitachi-hiuxwe2"); break; } } else if (CPU_IS_HP_MC68K (cpu)) puts ("m68k-hitachi-hiuxwe2"); else puts ("unknown-hitachi-hiuxwe2"); exit (0); } EOF $CC_FOR_BUILD -o $dummy $dummy.c && SYSTEM_NAME=`$dummy` && { echo "$SYSTEM_NAME"; exit; } echo unknown-hitachi-hiuxwe2 exit ;; 9000/7??:4.3bsd:*:* | 9000/8?[79]:4.3bsd:*:* ) echo hppa1.1-hp-bsd exit ;; 9000/8??:4.3bsd:*:*) echo hppa1.0-hp-bsd exit ;; *9??*:MPE/iX:*:* | *3000*:MPE/iX:*:*) echo hppa1.0-hp-mpeix exit ;; hp7??:OSF1:*:* | hp8?[79]:OSF1:*:* ) echo hppa1.1-hp-osf exit ;; hp8??:OSF1:*:*) echo hppa1.0-hp-osf exit ;; i*86:OSF1:*:*) if [ -x /usr/sbin/sysversion ] ; then echo ${UNAME_MACHINE}-unknown-osf1mk else echo ${UNAME_MACHINE}-unknown-osf1 fi exit ;; parisc*:Lites*:*:*) echo hppa1.1-hp-lites exit ;; C1*:ConvexOS:*:* | convex:ConvexOS:C1*:*) echo c1-convex-bsd exit ;; C2*:ConvexOS:*:* | convex:ConvexOS:C2*:*) if getsysinfo -f scalar_acc then echo c32-convex-bsd else echo c2-convex-bsd fi exit ;; C34*:ConvexOS:*:* | convex:ConvexOS:C34*:*) echo c34-convex-bsd exit ;; C38*:ConvexOS:*:* | convex:ConvexOS:C38*:*) echo c38-convex-bsd exit ;; C4*:ConvexOS:*:* | convex:ConvexOS:C4*:*) echo c4-convex-bsd exit ;; CRAY*Y-MP:*:*:*) echo ymp-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' exit ;; CRAY*[A-Z]90:*:*:*) echo ${UNAME_MACHINE}-cray-unicos${UNAME_RELEASE} \ | sed -e 's/CRAY.*\([A-Z]90\)/\1/' \ -e y/ABCDEFGHIJKLMNOPQRSTUVWXYZ/abcdefghijklmnopqrstuvwxyz/ \ -e 's/\.[^.]*$/.X/' exit ;; CRAY*TS:*:*:*) echo t90-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' exit ;; CRAY*T3E:*:*:*) echo alphaev5-cray-unicosmk${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' exit ;; CRAY*SV1:*:*:*) echo sv1-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' exit ;; *:UNICOS/mp:*:*) echo craynv-cray-unicosmp${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' exit ;; F30[01]:UNIX_System_V:*:* | F700:UNIX_System_V:*:*) FUJITSU_PROC=`uname -m | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz` FUJITSU_SYS=`uname -p | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz | sed -e 's/\///'` FUJITSU_REL=`echo ${UNAME_RELEASE} | sed -e 's/ /_/'` echo "${FUJITSU_PROC}-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}" exit ;; 5000:UNIX_System_V:4.*:*) FUJITSU_SYS=`uname -p | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz | sed -e 's/\///'` FUJITSU_REL=`echo ${UNAME_RELEASE} | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz | sed -e 's/ /_/'` echo "sparc-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}" exit ;; i*86:BSD/386:*:* | i*86:BSD/OS:*:* | *:Ascend\ Embedded/OS:*:*) echo
${UNAME_MACHINE}-pc-bsdi${UNAME_RELEASE} exit ;; sparc*:BSD/OS:*:*) echo sparc-unknown-bsdi${UNAME_RELEASE} exit ;; *:BSD/OS:*:*) echo ${UNAME_MACHINE}-unknown-bsdi${UNAME_RELEASE} exit ;; *:FreeBSD:*:*) UNAME_PROCESSOR=`/usr/bin/uname -p` case ${UNAME_PROCESSOR} in amd64) UNAME_PROCESSOR=x86_64 ;; i386) UNAME_PROCESSOR=i586 ;; esac echo ${UNAME_PROCESSOR}-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` exit ;; i*:CYGWIN*:*) echo ${UNAME_MACHINE}-pc-cygwin exit ;; *:MINGW64*:*) echo ${UNAME_MACHINE}-pc-mingw64 exit ;; *:MINGW*:*) echo ${UNAME_MACHINE}-pc-mingw32 exit ;; *:MSYS*:*) echo ${UNAME_MACHINE}-pc-msys exit ;; i*:windows32*:*) # uname -m includes "-pc" on this system. echo ${UNAME_MACHINE}-mingw32 exit ;; i*:PW*:*) echo ${UNAME_MACHINE}-pc-pw32 exit ;; *:Interix*:*) case ${UNAME_MACHINE} in x86) echo i586-pc-interix${UNAME_RELEASE} exit ;; authenticamd | genuineintel | EM64T) echo x86_64-unknown-interix${UNAME_RELEASE} exit ;; IA64) echo ia64-unknown-interix${UNAME_RELEASE} exit ;; esac ;; [345]86:Windows_95:* | [345]86:Windows_98:* | [345]86:Windows_NT:*) echo i${UNAME_MACHINE}-pc-mks exit ;; 8664:Windows_NT:*) echo x86_64-pc-mks exit ;; i*:Windows_NT*:* | Pentium*:Windows_NT*:*) # How do we know it's Interix rather than the generic POSIX subsystem? # It also conflicts with pre-2.0 versions of AT&T UWIN. Should we # UNAME_MACHINE based on the output of uname instead of i386? echo i586-pc-interix exit ;; i*:UWIN*:*) echo ${UNAME_MACHINE}-pc-uwin exit ;; amd64:CYGWIN*:*:* | x86_64:CYGWIN*:*:*) echo x86_64-unknown-cygwin exit ;; p*:CYGWIN*:*) echo powerpcle-unknown-cygwin exit ;; prep*:SunOS:5.*:*) echo powerpcle-unknown-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` exit ;; *:GNU:*:*) # the GNU system echo `echo ${UNAME_MACHINE}|sed -e 's,[-/].*$,,'`-unknown-${LIBC}`echo ${UNAME_RELEASE}|sed -e 's,/.*$,,'` exit ;; *:GNU/*:*:*) # other systems with GNU libc and userland echo ${UNAME_MACHINE}-unknown-`echo ${UNAME_SYSTEM} | sed 's,^[^/]*/,,' | tr "[:upper:]" "[:lower:]"``echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`-${LIBC} exit ;; i*86:Minix:*:*) echo ${UNAME_MACHINE}-pc-minix exit ;; aarch64:Linux:*:*) echo ${UNAME_MACHINE}-unknown-linux-${LIBC} exit ;; aarch64_be:Linux:*:*) UNAME_MACHINE=aarch64_be echo ${UNAME_MACHINE}-unknown-linux-${LIBC} exit ;; alpha:Linux:*:*) case `sed -n '/^cpu model/s/^.*: \(.*\)/\1/p' < /proc/cpuinfo` in EV5) UNAME_MACHINE=alphaev5 ;; EV56) UNAME_MACHINE=alphaev56 ;; PCA56) UNAME_MACHINE=alphapca56 ;; PCA57) UNAME_MACHINE=alphapca56 ;; EV6) UNAME_MACHINE=alphaev6 ;; EV67) UNAME_MACHINE=alphaev67 ;; EV68*) UNAME_MACHINE=alphaev68 ;; esac objdump --private-headers /bin/sh | grep -q ld.so.1 if test "$?" 
= 0 ; then LIBC=gnulibc1 ; fi echo ${UNAME_MACHINE}-unknown-linux-${LIBC} exit ;; arc:Linux:*:* | arceb:Linux:*:*) echo ${UNAME_MACHINE}-unknown-linux-${LIBC} exit ;; arm*:Linux:*:*) eval $set_cc_for_build if echo __ARM_EABI__ | $CC_FOR_BUILD -E - 2>/dev/null \ | grep -q __ARM_EABI__ then echo ${UNAME_MACHINE}-unknown-linux-${LIBC} else if echo __ARM_PCS_VFP | $CC_FOR_BUILD -E - 2>/dev/null \ | grep -q __ARM_PCS_VFP then echo ${UNAME_MACHINE}-unknown-linux-${LIBC}eabi else echo ${UNAME_MACHINE}-unknown-linux-${LIBC}eabihf fi fi exit ;; avr32*:Linux:*:*) echo ${UNAME_MACHINE}-unknown-linux-${LIBC} exit ;; cris:Linux:*:*) echo ${UNAME_MACHINE}-axis-linux-${LIBC} exit ;; crisv32:Linux:*:*) echo ${UNAME_MACHINE}-axis-linux-${LIBC} exit ;; e2k:Linux:*:*) echo ${UNAME_MACHINE}-unknown-linux-${LIBC} exit ;; frv:Linux:*:*) echo ${UNAME_MACHINE}-unknown-linux-${LIBC} exit ;; hexagon:Linux:*:*) echo ${UNAME_MACHINE}-unknown-linux-${LIBC} exit ;; i*86:Linux:*:*) echo ${UNAME_MACHINE}-pc-linux-${LIBC} exit ;; ia64:Linux:*:*) echo ${UNAME_MACHINE}-unknown-linux-${LIBC} exit ;; k1om:Linux:*:*) echo ${UNAME_MACHINE}-unknown-linux-${LIBC} exit ;; m32r*:Linux:*:*) echo ${UNAME_MACHINE}-unknown-linux-${LIBC} exit ;; m68*:Linux:*:*) echo ${UNAME_MACHINE}-unknown-linux-${LIBC} exit ;; mips:Linux:*:* | mips64:Linux:*:*) eval $set_cc_for_build sed 's/^ //' << EOF >$dummy.c #undef CPU #undef ${UNAME_MACHINE} #undef ${UNAME_MACHINE}el #if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL) CPU=${UNAME_MACHINE}el #else #if defined(__MIPSEB__) || defined(__MIPSEB) || defined(_MIPSEB) || defined(MIPSEB) CPU=${UNAME_MACHINE} #else CPU= #endif #endif EOF eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^CPU'` test x"${CPU}" != x && { echo "${CPU}-unknown-linux-${LIBC}"; exit; } ;; mips64el:Linux:*:*) echo ${UNAME_MACHINE}-unknown-linux-${LIBC} exit ;; openrisc*:Linux:*:*) echo or1k-unknown-linux-${LIBC} exit ;; or32:Linux:*:* | or1k*:Linux:*:*) echo ${UNAME_MACHINE}-unknown-linux-${LIBC} exit ;; padre:Linux:*:*) echo sparc-unknown-linux-${LIBC} exit ;; parisc64:Linux:*:* | hppa64:Linux:*:*) echo hppa64-unknown-linux-${LIBC} exit ;; parisc:Linux:*:* | hppa:Linux:*:*) # Look for CPU level case `grep '^cpu[^a-z]*:' /proc/cpuinfo 2>/dev/null | cut -d' ' -f2` in PA7*) echo hppa1.1-unknown-linux-${LIBC} ;; PA8*) echo hppa2.0-unknown-linux-${LIBC} ;; *) echo hppa-unknown-linux-${LIBC} ;; esac exit ;; ppc64:Linux:*:*) echo powerpc64-unknown-linux-${LIBC} exit ;; ppc:Linux:*:*) echo powerpc-unknown-linux-${LIBC} exit ;; ppc64le:Linux:*:*) echo powerpc64le-unknown-linux-${LIBC} exit ;; ppcle:Linux:*:*) echo powerpcle-unknown-linux-${LIBC} exit ;; riscv32:Linux:*:* | riscv64:Linux:*:*) echo ${UNAME_MACHINE}-unknown-linux-${LIBC} exit ;; s390:Linux:*:* | s390x:Linux:*:*) echo ${UNAME_MACHINE}-ibm-linux-${LIBC} exit ;; sh64*:Linux:*:*) echo ${UNAME_MACHINE}-unknown-linux-${LIBC} exit ;; sh*:Linux:*:*) echo ${UNAME_MACHINE}-unknown-linux-${LIBC} exit ;; sparc:Linux:*:* | sparc64:Linux:*:*) echo ${UNAME_MACHINE}-unknown-linux-${LIBC} exit ;; tile*:Linux:*:*) echo ${UNAME_MACHINE}-unknown-linux-${LIBC} exit ;; vax:Linux:*:*) echo ${UNAME_MACHINE}-dec-linux-${LIBC} exit ;; x86_64:Linux:*:*) echo ${UNAME_MACHINE}-pc-linux-${LIBC} exit ;; xtensa*:Linux:*:*) echo ${UNAME_MACHINE}-unknown-linux-${LIBC} exit ;; i*86:DYNIX/ptx:4*:*) # ptx 4.0 does uname -s correctly, with DYNIX/ptx in there. # earlier versions are messed up and put the nodename in both # sysname and nodename. 
echo i386-sequent-sysv4 exit ;; i*86:UNIX_SV:4.2MP:2.*) # Unixware is an offshoot of SVR4, but it has its own version # number series starting with 2... # I am not positive that other SVR4 systems won't match this, # I just have to hope. -- rms. # Use sysv4.2uw... so that sysv4* matches it. echo ${UNAME_MACHINE}-pc-sysv4.2uw${UNAME_VERSION} exit ;; i*86:OS/2:*:*) # If we were able to find `uname', then EMX Unix compatibility # is probably installed. echo ${UNAME_MACHINE}-pc-os2-emx exit ;; i*86:XTS-300:*:STOP) echo ${UNAME_MACHINE}-unknown-stop exit ;; i*86:atheos:*:*) echo ${UNAME_MACHINE}-unknown-atheos exit ;; i*86:syllable:*:*) echo ${UNAME_MACHINE}-pc-syllable exit ;; i*86:LynxOS:2.*:* | i*86:LynxOS:3.[01]*:* | i*86:LynxOS:4.[02]*:*) echo i386-unknown-lynxos${UNAME_RELEASE} exit ;; i*86:*DOS:*:*) echo ${UNAME_MACHINE}-pc-msdosdjgpp exit ;; i*86:*:4.*:* | i*86:SYSTEM_V:4.*:*) UNAME_REL=`echo ${UNAME_RELEASE} | sed 's/\/MP$//'` if grep Novell /usr/include/link.h >/dev/null 2>/dev/null; then echo ${UNAME_MACHINE}-univel-sysv${UNAME_REL} else echo ${UNAME_MACHINE}-pc-sysv${UNAME_REL} fi exit ;; i*86:*:5:[678]*) # UnixWare 7.x, OpenUNIX and OpenServer 6. case `/bin/uname -X | grep "^Machine"` in *486*) UNAME_MACHINE=i486 ;; *Pentium) UNAME_MACHINE=i586 ;; *Pent*|*Celeron) UNAME_MACHINE=i686 ;; esac echo ${UNAME_MACHINE}-unknown-sysv${UNAME_RELEASE}${UNAME_SYSTEM}${UNAME_VERSION} exit ;; i*86:*:3.2:*) if test -f /usr/options/cb.name; then UNAME_REL=`sed -n 's/.*Version //p' </usr/options/cb.name` echo ${UNAME_MACHINE}-pc-isc$UNAME_REL elif /bin/uname -X 2>/dev/null >/dev/null ; then UNAME_REL=`(/bin/uname -X|grep Release|sed -e 's/.*= //')` (/bin/uname -X|grep i80486 >/dev/null) && UNAME_MACHINE=i486 (/bin/uname -X|grep '^Machine.*Pentium' >/dev/null) \ && UNAME_MACHINE=i586 (/bin/uname -X|grep '^Machine.*Pent *II' >/dev/null) \ && UNAME_MACHINE=i686 (/bin/uname -X|grep '^Machine.*Pentium Pro' >/dev/null) \ && UNAME_MACHINE=i686 echo ${UNAME_MACHINE}-pc-sco$UNAME_REL else echo ${UNAME_MACHINE}-pc-sysv32 fi exit ;; pc:*:*:*) # Left here for compatibility: # uname -m prints for DJGPP always 'pc', but it prints nothing about # the processor, so we play safe by assuming i586. # Note: whatever this is, it MUST be the same as what config.sub # prints for the "djgpp" host, or else GDB configure will decide that # this is a cross-build. echo i586-pc-msdosdjgpp exit ;; Intel:Mach:3*:*) echo i386-pc-mach3 exit ;; paragon:*:*:*) echo i860-intel-osf1 exit ;; i860:*:4.*:*) # i860-SVR4 if grep Stardent /usr/include/sys/uadmin.h >/dev/null 2>&1 ; then echo i860-stardent-sysv${UNAME_RELEASE} # Stardent Vistra i860-SVR4 else # Add other i860-SVR4 vendors below as they are discovered.
echo i860-unknown-sysv${UNAME_RELEASE} # Unknown i860-SVR4 fi exit ;; mini*:CTIX:SYS*5:*) # "miniframe" echo m68010-convergent-sysv exit ;; mc68k:UNIX:SYSTEM5:3.51m) echo m68k-convergent-sysv exit ;; M680?0:D-NIX:5.3:*) echo m68k-diab-dnix exit ;; M68*:*:R3V[5678]*:*) test -r /sysV68 && { echo 'm68k-motorola-sysv'; exit; } ;; 3[345]??:*:4.0:3.0 | 3[34]??A:*:4.0:3.0 | 3[34]??,*:*:4.0:3.0 | 3[34]??/*:*:4.0:3.0 | 4400:*:4.0:3.0 | 4850:*:4.0:3.0 | SKA40:*:4.0:3.0 | SDS2:*:4.0:3.0 | SHG2:*:4.0:3.0 | S7501*:*:4.0:3.0) OS_REL='' test -r /etc/.relid \ && OS_REL=.`sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid` /bin/uname -p 2>/dev/null | grep 86 >/dev/null \ && { echo i486-ncr-sysv4.3${OS_REL}; exit; } /bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \ && { echo i586-ncr-sysv4.3${OS_REL}; exit; } ;; 3[34]??:*:4.0:* | 3[34]??,*:*:4.0:*) /bin/uname -p 2>/dev/null | grep 86 >/dev/null \ && { echo i486-ncr-sysv4; exit; } ;; NCR*:*:4.2:* | MPRAS*:*:4.2:*) OS_REL='.3' test -r /etc/.relid \ && OS_REL=.`sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid` /bin/uname -p 2>/dev/null | grep 86 >/dev/null \ && { echo i486-ncr-sysv4.3${OS_REL}; exit; } /bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \ && { echo i586-ncr-sysv4.3${OS_REL}; exit; } /bin/uname -p 2>/dev/null | /bin/grep pteron >/dev/null \ && { echo i586-ncr-sysv4.3${OS_REL}; exit; } ;; m68*:LynxOS:2.*:* | m68*:LynxOS:3.0*:*) echo m68k-unknown-lynxos${UNAME_RELEASE} exit ;; mc68030:UNIX_System_V:4.*:*) echo m68k-atari-sysv4 exit ;; TSUNAMI:LynxOS:2.*:*) echo sparc-unknown-lynxos${UNAME_RELEASE} exit ;; rs6000:LynxOS:2.*:*) echo rs6000-unknown-lynxos${UNAME_RELEASE} exit ;; PowerPC:LynxOS:2.*:* | PowerPC:LynxOS:3.[01]*:* | PowerPC:LynxOS:4.[02]*:*) echo powerpc-unknown-lynxos${UNAME_RELEASE} exit ;; SM[BE]S:UNIX_SV:*:*) echo mips-dde-sysv${UNAME_RELEASE} exit ;; RM*:ReliantUNIX-*:*:*) echo mips-sni-sysv4 exit ;; RM*:SINIX-*:*:*) echo mips-sni-sysv4 exit ;; *:SINIX-*:*:*) if uname -p 2>/dev/null >/dev/null ; then UNAME_MACHINE=`(uname -p) 2>/dev/null` echo ${UNAME_MACHINE}-sni-sysv4 else echo ns32k-sni-sysv fi exit ;; PENTIUM:*:4.0*:*) # Unisys `ClearPath HMP IX 4000' SVR4/MP effort # says <Richard.M.Bartel@ccMail.Census.GOV> echo i586-unisys-sysv4 exit ;; *:UNIX_System_V:4*:FTX*) # From Gerald Hewes <hewes@openmarket.com>. # How about differentiating between stratus architectures? -djm echo hppa1.1-stratus-sysv4 exit ;; *:*:*:FTX*) # From seanf@swdc.stratus.com. echo i860-stratus-sysv4 exit ;; i*86:VOS:*:*) # From Paul.Green@stratus.com. echo ${UNAME_MACHINE}-stratus-vos exit ;; *:VOS:*:*) # From Paul.Green@stratus.com. echo hppa1.1-stratus-vos exit ;; mc68*:A/UX:*:*) echo m68k-apple-aux${UNAME_RELEASE} exit ;; news*:NEWS-OS:6*:*) echo mips-sony-newsos6 exit ;; R[34]000:*System_V*:*:* | R4000:UNIX_SYSV:*:* | R*000:UNIX_SV:*:*) if [ -d /usr/nec ]; then echo mips-nec-sysv${UNAME_RELEASE} else echo mips-unknown-sysv${UNAME_RELEASE} fi exit ;; BeBox:BeOS:*:*) # BeOS running on hardware made by Be, PPC only. echo powerpc-be-beos exit ;; BeMac:BeOS:*:*) # BeOS running on Mac or Mac clone, PPC only. echo powerpc-apple-beos exit ;; BePC:BeOS:*:*) # BeOS running on Intel PC compatible. echo i586-pc-beos exit ;; BePC:Haiku:*:*) # Haiku running on Intel PC compatible.
echo i586-pc-haiku exit ;; x86_64:Haiku:*:*) echo x86_64-unknown-haiku exit ;; SX-4:SUPER-UX:*:*) echo sx4-nec-superux${UNAME_RELEASE} exit ;; SX-5:SUPER-UX:*:*) echo sx5-nec-superux${UNAME_RELEASE} exit ;; SX-6:SUPER-UX:*:*) echo sx6-nec-superux${UNAME_RELEASE} exit ;; SX-7:SUPER-UX:*:*) echo sx7-nec-superux${UNAME_RELEASE} exit ;; SX-8:SUPER-UX:*:*) echo sx8-nec-superux${UNAME_RELEASE} exit ;; SX-8R:SUPER-UX:*:*) echo sx8r-nec-superux${UNAME_RELEASE} exit ;; SX-ACE:SUPER-UX:*:*) echo sxace-nec-superux${UNAME_RELEASE} exit ;; Power*:Rhapsody:*:*) echo powerpc-apple-rhapsody${UNAME_RELEASE} exit ;; *:Rhapsody:*:*) echo ${UNAME_MACHINE}-apple-rhapsody${UNAME_RELEASE} exit ;; *:Darwin:*:*) UNAME_PROCESSOR=`uname -p` || UNAME_PROCESSOR=unknown eval $set_cc_for_build if test "$UNAME_PROCESSOR" = unknown ; then UNAME_PROCESSOR=powerpc fi if test `echo "$UNAME_RELEASE" | sed -e 's/\..*//'` -le 10 ; then if [ "$CC_FOR_BUILD" != no_compiler_found ]; then if (echo '#ifdef __LP64__'; echo IS_64BIT_ARCH; echo '#endif') | \ (CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) | \ grep IS_64BIT_ARCH >/dev/null then case $UNAME_PROCESSOR in i386) UNAME_PROCESSOR=x86_64 ;; powerpc) UNAME_PROCESSOR=powerpc64 ;; esac fi # On 10.4-10.6 one might compile for PowerPC via gcc -arch ppc if (echo '#ifdef __POWERPC__'; echo IS_PPC; echo '#endif') | \ (CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) | \ grep IS_PPC >/dev/null then UNAME_PROCESSOR=powerpc fi fi elif test "$UNAME_PROCESSOR" = i386 ; then # Avoid executing cc on OS X 10.9, as it ships with a stub # that puts up a graphical alert prompting to install # developer tools. Any system running Mac OS X 10.7 or # later (Darwin 11 and later) is required to have a 64-bit # processor. This is not true of the ARM version of Darwin # that Apple uses in portable devices. UNAME_PROCESSOR=x86_64 fi echo ${UNAME_PROCESSOR}-apple-darwin${UNAME_RELEASE} exit ;; *:procnto*:*:* | *:QNX:[0123456789]*:*) UNAME_PROCESSOR=`uname -p` if test "$UNAME_PROCESSOR" = x86; then UNAME_PROCESSOR=i386 UNAME_MACHINE=pc fi echo ${UNAME_PROCESSOR}-${UNAME_MACHINE}-nto-qnx${UNAME_RELEASE} exit ;; *:QNX:*:4*) echo i386-pc-qnx exit ;; NEO-*:NONSTOP_KERNEL:*:*) echo neo-tandem-nsk${UNAME_RELEASE} exit ;; NSE-*:NONSTOP_KERNEL:*:*) echo nse-tandem-nsk${UNAME_RELEASE} exit ;; NSR-*:NONSTOP_KERNEL:*:*) echo nsr-tandem-nsk${UNAME_RELEASE} exit ;; NSX-*:NONSTOP_KERNEL:*:*) echo nsx-tandem-nsk${UNAME_RELEASE} exit ;; *:NonStop-UX:*:*) echo mips-compaq-nonstopux exit ;; BS2000:POSIX*:*:*) echo bs2000-siemens-sysv exit ;; DS/*:UNIX_System_V:*:*) echo ${UNAME_MACHINE}-${UNAME_SYSTEM}-${UNAME_RELEASE} exit ;; *:Plan9:*:*) # "uname -m" is not consistent, so use $cputype instead. 386 # is converted to i386 for consistency with other x86 # operating systems. 
if test "$cputype" = 386; then UNAME_MACHINE=i386 else UNAME_MACHINE="$cputype" fi echo ${UNAME_MACHINE}-unknown-plan9 exit ;; *:TOPS-10:*:*) echo pdp10-unknown-tops10 exit ;; *:TENEX:*:*) echo pdp10-unknown-tenex exit ;; KS10:TOPS-20:*:* | KL10:TOPS-20:*:* | TYPE4:TOPS-20:*:*) echo pdp10-dec-tops20 exit ;; XKL-1:TOPS-20:*:* | TYPE5:TOPS-20:*:*) echo pdp10-xkl-tops20 exit ;; *:TOPS-20:*:*) echo pdp10-unknown-tops20 exit ;; *:ITS:*:*) echo pdp10-unknown-its exit ;; SEI:*:*:SEIUX) echo mips-sei-seiux${UNAME_RELEASE} exit ;; *:DragonFly:*:*) echo ${UNAME_MACHINE}-unknown-dragonfly`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` exit ;; *:*VMS:*:*) UNAME_MACHINE=`(uname -p) 2>/dev/null` case "${UNAME_MACHINE}" in A*) echo alpha-dec-vms ; exit ;; I*) echo ia64-dec-vms ; exit ;; V*) echo vax-dec-vms ; exit ;; esac ;; *:XENIX:*:SysV) echo i386-pc-xenix exit ;; i*86:skyos:*:*) echo ${UNAME_MACHINE}-pc-skyos`echo ${UNAME_RELEASE} | sed -e 's/ .*$//'` exit ;; i*86:rdos:*:*) echo ${UNAME_MACHINE}-pc-rdos exit ;; i*86:AROS:*:*) echo ${UNAME_MACHINE}-pc-aros exit ;; x86_64:VMkernel:*:*) echo ${UNAME_MACHINE}-unknown-esx exit ;; amd64:Isilon\ OneFS:*:*) echo x86_64-unknown-onefs exit ;; esac cat >&2 </dev/null || echo unknown` uname -r = `(uname -r) 2>/dev/null || echo unknown` uname -s = `(uname -s) 2>/dev/null || echo unknown` uname -v = `(uname -v) 2>/dev/null || echo unknown` /usr/bin/uname -p = `(/usr/bin/uname -p) 2>/dev/null` /bin/uname -X = `(/bin/uname -X) 2>/dev/null` hostinfo = `(hostinfo) 2>/dev/null` /bin/universe = `(/bin/universe) 2>/dev/null` /usr/bin/arch -k = `(/usr/bin/arch -k) 2>/dev/null` /bin/arch = `(/bin/arch) 2>/dev/null` /usr/bin/oslevel = `(/usr/bin/oslevel) 2>/dev/null` /usr/convex/getsysinfo = `(/usr/convex/getsysinfo) 2>/dev/null` UNAME_MACHINE = ${UNAME_MACHINE} UNAME_RELEASE = ${UNAME_RELEASE} UNAME_SYSTEM = ${UNAME_SYSTEM} UNAME_VERSION = ${UNAME_VERSION} EOF exit 1 # Local variables: # eval: (add-hook 'write-file-hooks 'time-stamp) # time-stamp-start: "timestamp='" # time-stamp-format: "%:y-%02m-%02d" # time-stamp-end: "'" # End: davs2-1.6/build/linux/config.sub000077500000000000000000001072431337322544400166360ustar00rootroot00000000000000#! /bin/sh # Configuration validation subroutine script. # Copyright 1992-2017 Free Software Foundation, Inc. timestamp='2017-04-02' # This file is free software; you can redistribute it and/or modify it # under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, see . # # As a special exception to the GNU General Public License, if you # distribute this file as part of a program that contains a # configuration script generated by Autoconf, you may include it under # the same distribution terms that you use for the rest of that # program. This Exception is an additional permission under section 7 # of the GNU General Public License, version 3 ("GPLv3"). # Please send patches to . # # Configuration subroutine to validate and canonicalize a configuration type. # Supply the specified configuration type as an argument. 
# If it is invalid, we print an error message on stderr and exit with code 1. # Otherwise, we print the canonical config type on stdout and succeed. # You can get the latest version of this script from: # http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.sub # This file is supposed to be the same for all GNU packages # and recognize all the CPU types, system types and aliases # that are meaningful with *any* GNU software. # Each package is responsible for reporting which valid configurations # it does not support. The user should be able to distinguish # a failure to support a valid configuration from a meaningless # configuration. # The goal of this file is to map all the various variations of a given # machine specification into a single specification in the form: # CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM # or in some cases, the newer four-part form: # CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM # It is wrong to echo any other type of specification. me=`echo "$0" | sed -e 's,.*/,,'` usage="\ Usage: $0 [OPTION] CPU-MFR-OPSYS or ALIAS Canonicalize a configuration name. Operation modes: -h, --help print this help, then exit -t, --time-stamp print date of last modification, then exit -v, --version print version number, then exit Report bugs and patches to <config-patches@gnu.org>." version="\ GNU config.sub ($timestamp) Copyright 1992-2017 Free Software Foundation, Inc. This is free software; see the source for copying conditions. There is NO warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE." help=" Try \`$me --help' for more information." # Parse command line while test $# -gt 0 ; do case $1 in --time-stamp | --time* | -t ) echo "$timestamp" ; exit ;; --version | -v ) echo "$version" ; exit ;; --help | --h* | -h ) echo "$usage"; exit ;; -- ) # Stop option processing shift; break ;; - ) # Use stdin as input. break ;; -* ) echo "$me: invalid option $1$help" exit 1 ;; *local*) # First pass through any local machine types. echo $1 exit ;; * ) break ;; esac done case $# in 0) echo "$me: missing argument$help" >&2 exit 1;; 1) ;; *) echo "$me: too many arguments$help" >&2 exit 1;; esac # Separate what the user gave into CPU-COMPANY and OS or KERNEL-OS (if any). # Here we must recognize all the valid KERNEL-OS combinations. maybe_os=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\2/'` case $maybe_os in nto-qnx* | linux-gnu* | linux-android* | linux-dietlibc | linux-newlib* | \ linux-musl* | linux-uclibc* | uclinux-uclibc* | uclinux-gnu* | kfreebsd*-gnu* | \ knetbsd*-gnu* | netbsd*-gnu* | netbsd*-eabi* | \ kopensolaris*-gnu* | cloudabi*-eabi* | \ storm-chaos* | os2-emx* | rtmk-nova*) os=-$maybe_os basic_machine=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\1/'` ;; android-linux) os=-linux-android basic_machine=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\1/'`-unknown ;; *) basic_machine=`echo $1 | sed 's/-[^-]*$//'` if [ $basic_machine != $1 ] then os=`echo $1 | sed 's/.*-/-/'` else os=; fi ;; esac ### Let's recognize common machines as not being operating systems so ### that things like config.sub decstation-3100 work. We also ### recognize some manufacturers as not being operating systems, so we ### can provide default operating systems below. case $os in -sun*os*) # Prevent following clause from handling this invalid input.
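# A worked example of the split above, added here for illustration (it is
# not part of the upstream script): for the alias "amd64-linux", $maybe_os
# ("amd64-linux") matches none of the KERNEL-OS patterns, so the default
# arm strips the last component, leaving basic_machine=amd64 and os=-linux.
# The alias and OS tables further down then rewrite these to x86_64-pc and
# -linux-gnu:
#
#   $ sh ./config.sub amd64-linux
#   x86_64-pc-linux-gnu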
;; -dec* | -mips* | -sequent* | -encore* | -pc532* | -sgi* | -sony* | \ -att* | -7300* | -3300* | -delta* | -motorola* | -sun[234]* | \ -unicom* | -ibm* | -next | -hp | -isi* | -apollo | -altos* | \ -convergent* | -ncr* | -news | -32* | -3600* | -3100* | -hitachi* |\ -c[123]* | -convex* | -sun | -crds | -omron* | -dg | -ultra | -tti* | \ -harris | -dolphin | -highlevel | -gould | -cbm | -ns | -masscomp | \ -apple | -axis | -knuth | -cray | -microblaze*) os= basic_machine=$1 ;; -bluegene*) os=-cnk ;; -sim | -cisco | -oki | -wec | -winbond) os= basic_machine=$1 ;; -scout) ;; -wrs) os=-vxworks basic_machine=$1 ;; -chorusos*) os=-chorusos basic_machine=$1 ;; -chorusrdb) os=-chorusrdb basic_machine=$1 ;; -hiux*) os=-hiuxwe2 ;; -sco6) os=-sco5v6 basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` ;; -sco5) os=-sco3.2v5 basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` ;; -sco4) os=-sco3.2v4 basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` ;; -sco3.2.[4-9]*) os=`echo $os | sed -e 's/sco3.2./sco3.2v/'` basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` ;; -sco3.2v[4-9]*) # Don't forget version if it is 3.2v4 or newer. basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` ;; -sco5v6*) # Don't forget version if it is 3.2v4 or newer. basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` ;; -sco*) os=-sco3.2v2 basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` ;; -udk*) basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` ;; -isc) os=-isc2.2 basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` ;; -clix*) basic_machine=clipper-intergraph ;; -isc*) basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` ;; -lynx*178) os=-lynxos178 ;; -lynx*5) os=-lynxos5 ;; -lynx*) os=-lynxos ;; -ptx*) basic_machine=`echo $1 | sed -e 's/86-.*/86-sequent/'` ;; -windowsnt*) os=`echo $os | sed -e 's/windowsnt/winnt/'` ;; -psos*) os=-psos ;; -mint | -mint[0-9]*) basic_machine=m68k-atari os=-mint ;; esac # Decode aliases for certain CPU-COMPANY combinations. case $basic_machine in # Recognize the basic CPU types without company name. # Some are omitted here because they have special meanings below. 
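# Illustrative examples for the decode table that follows (not part of the
# upstream script): a bare CPU name picks up a default manufacturer, so
# "moxie" matches the big list below and becomes moxie-unknown, while
# "i686" falls through to the i*86 | x86_64 arm and becomes i686-pc,
# because such machines normally are PCs (as the comment on that arm
# explains).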
1750a | 580 \ | a29k \ | aarch64 | aarch64_be \ | alpha | alphaev[4-8] | alphaev56 | alphaev6[78] | alphapca5[67] \ | alpha64 | alpha64ev[4-8] | alpha64ev56 | alpha64ev6[78] | alpha64pca5[67] \ | am33_2.0 \ | arc | arceb \ | arm | arm[bl]e | arme[lb] | armv[2-8] | armv[3-8][lb] | armv7[arm] \ | avr | avr32 \ | ba \ | be32 | be64 \ | bfin \ | c4x | c8051 | clipper \ | d10v | d30v | dlx | dsp16xx \ | e2k | epiphany \ | fido | fr30 | frv | ft32 \ | h8300 | h8500 | hppa | hppa1.[01] | hppa2.0 | hppa2.0[nw] | hppa64 \ | hexagon \ | i370 | i860 | i960 | ia16 | ia64 \ | ip2k | iq2000 \ | k1om \ | le32 | le64 \ | lm32 \ | m32c | m32r | m32rle | m68000 | m68k | m88k \ | maxq | mb | microblaze | microblazeel | mcore | mep | metag \ | mips | mipsbe | mipseb | mipsel | mipsle \ | mips16 \ | mips64 | mips64el \ | mips64octeon | mips64octeonel \ | mips64orion | mips64orionel \ | mips64r5900 | mips64r5900el \ | mips64vr | mips64vrel \ | mips64vr4100 | mips64vr4100el \ | mips64vr4300 | mips64vr4300el \ | mips64vr5000 | mips64vr5000el \ | mips64vr5900 | mips64vr5900el \ | mipsisa32 | mipsisa32el \ | mipsisa32r2 | mipsisa32r2el \ | mipsisa32r6 | mipsisa32r6el \ | mipsisa64 | mipsisa64el \ | mipsisa64r2 | mipsisa64r2el \ | mipsisa64r6 | mipsisa64r6el \ | mipsisa64sb1 | mipsisa64sb1el \ | mipsisa64sr71k | mipsisa64sr71kel \ | mipsr5900 | mipsr5900el \ | mipstx39 | mipstx39el \ | mn10200 | mn10300 \ | moxie \ | mt \ | msp430 \ | nds32 | nds32le | nds32be \ | nios | nios2 | nios2eb | nios2el \ | ns16k | ns32k \ | open8 | or1k | or1knd | or32 \ | pdp10 | pdp11 | pj | pjl \ | powerpc | powerpc64 | powerpc64le | powerpcle \ | pru \ | pyramid \ | riscv32 | riscv64 \ | rl78 | rx \ | score \ | sh | sh[1234] | sh[24]a | sh[24]aeb | sh[23]e | sh[234]eb | sheb | shbe | shle | sh[1234]le | sh3ele \ | sh64 | sh64le \ | sparc | sparc64 | sparc64b | sparc64v | sparc86x | sparclet | sparclite \ | sparcv8 | sparcv9 | sparcv9b | sparcv9v \ | spu \ | tahoe | tic4x | tic54x | tic55x | tic6x | tic80 | tron \ | ubicom32 \ | v850 | v850e | v850e1 | v850e2 | v850es | v850e2v3 \ | visium \ | wasm32 \ | we32k \ | x86 | xc16x | xstormy16 | xtensa \ | z8k | z80) basic_machine=$basic_machine-unknown ;; c54x) basic_machine=tic54x-unknown ;; c55x) basic_machine=tic55x-unknown ;; c6x) basic_machine=tic6x-unknown ;; leon|leon[3-9]) basic_machine=sparc-$basic_machine ;; m6811 | m68hc11 | m6812 | m68hc12 | m68hcs12x | nvptx | picochip) basic_machine=$basic_machine-unknown os=-none ;; m88110 | m680[12346]0 | m683?2 | m68360 | m5200 | v70 | w65 | z8k) ;; ms1) basic_machine=mt-unknown ;; strongarm | thumb | xscale) basic_machine=arm-unknown ;; xgate) basic_machine=$basic_machine-unknown os=-none ;; xscaleeb) basic_machine=armeb-unknown ;; xscaleel) basic_machine=armel-unknown ;; # We use `pc' rather than `unknown' # because (1) that's what they normally are, and # (2) the word "unknown" tends to confuse beginning users. i*86 | x86_64) basic_machine=$basic_machine-pc ;; # Object if more than one company name word. *-*-*) echo Invalid configuration \`$1\': machine \`$basic_machine\' not recognized 1>&2 exit 1 ;; # Recognize the basic CPU types with company name. 
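# Contrast with the bare-CPU arms above (illustrative example, not part of
# the upstream script): "alphaev56" alone becomes alphaev56-unknown, but
# "alphaev56-dec" already carries a company name, matches the alphaev56-*
# pattern below and passes through unchanged.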
580-* \ | a29k-* \ | aarch64-* | aarch64_be-* \ | alpha-* | alphaev[4-8]-* | alphaev56-* | alphaev6[78]-* \ | alpha64-* | alpha64ev[4-8]-* | alpha64ev56-* | alpha64ev6[78]-* \ | alphapca5[67]-* | alpha64pca5[67]-* | arc-* | arceb-* \ | arm-* | armbe-* | armle-* | armeb-* | armv*-* \ | avr-* | avr32-* \ | ba-* \ | be32-* | be64-* \ | bfin-* | bs2000-* \ | c[123]* | c30-* | [cjt]90-* | c4x-* \ | c8051-* | clipper-* | craynv-* | cydra-* \ | d10v-* | d30v-* | dlx-* \ | e2k-* | elxsi-* \ | f30[01]-* | f700-* | fido-* | fr30-* | frv-* | fx80-* \ | h8300-* | h8500-* \ | hppa-* | hppa1.[01]-* | hppa2.0-* | hppa2.0[nw]-* | hppa64-* \ | hexagon-* \ | i*86-* | i860-* | i960-* | ia16-* | ia64-* \ | ip2k-* | iq2000-* \ | k1om-* \ | le32-* | le64-* \ | lm32-* \ | m32c-* | m32r-* | m32rle-* \ | m68000-* | m680[012346]0-* | m68360-* | m683?2-* | m68k-* \ | m88110-* | m88k-* | maxq-* | mcore-* | metag-* \ | microblaze-* | microblazeel-* \ | mips-* | mipsbe-* | mipseb-* | mipsel-* | mipsle-* \ | mips16-* \ | mips64-* | mips64el-* \ | mips64octeon-* | mips64octeonel-* \ | mips64orion-* | mips64orionel-* \ | mips64r5900-* | mips64r5900el-* \ | mips64vr-* | mips64vrel-* \ | mips64vr4100-* | mips64vr4100el-* \ | mips64vr4300-* | mips64vr4300el-* \ | mips64vr5000-* | mips64vr5000el-* \ | mips64vr5900-* | mips64vr5900el-* \ | mipsisa32-* | mipsisa32el-* \ | mipsisa32r2-* | mipsisa32r2el-* \ | mipsisa32r6-* | mipsisa32r6el-* \ | mipsisa64-* | mipsisa64el-* \ | mipsisa64r2-* | mipsisa64r2el-* \ | mipsisa64r6-* | mipsisa64r6el-* \ | mipsisa64sb1-* | mipsisa64sb1el-* \ | mipsisa64sr71k-* | mipsisa64sr71kel-* \ | mipsr5900-* | mipsr5900el-* \ | mipstx39-* | mipstx39el-* \ | mmix-* \ | mt-* \ | msp430-* \ | nds32-* | nds32le-* | nds32be-* \ | nios-* | nios2-* | nios2eb-* | nios2el-* \ | none-* | np1-* | ns16k-* | ns32k-* \ | open8-* \ | or1k*-* \ | orion-* \ | pdp10-* | pdp11-* | pj-* | pjl-* | pn-* | power-* \ | powerpc-* | powerpc64-* | powerpc64le-* | powerpcle-* \ | pru-* \ | pyramid-* \ | riscv32-* | riscv64-* \ | rl78-* | romp-* | rs6000-* | rx-* \ | sh-* | sh[1234]-* | sh[24]a-* | sh[24]aeb-* | sh[23]e-* | sh[34]eb-* | sheb-* | shbe-* \ | shle-* | sh[1234]le-* | sh3ele-* | sh64-* | sh64le-* \ | sparc-* | sparc64-* | sparc64b-* | sparc64v-* | sparc86x-* | sparclet-* \ | sparclite-* \ | sparcv8-* | sparcv9-* | sparcv9b-* | sparcv9v-* | sv1-* | sx*-* \ | tahoe-* \ | tic30-* | tic4x-* | tic54x-* | tic55x-* | tic6x-* | tic80-* \ | tile*-* \ | tron-* \ | ubicom32-* \ | v850-* | v850e-* | v850e1-* | v850es-* | v850e2-* | v850e2v3-* \ | vax-* \ | visium-* \ | wasm32-* \ | we32k-* \ | x86-* | x86_64-* | xc16x-* | xps100-* \ | xstormy16-* | xtensa*-* \ | ymp-* \ | z8k-* | z80-*) ;; # Recognize the basic CPU types without company name, with glob match. xtensa*) basic_machine=$basic_machine-unknown ;; # Recognize the various machine names and aliases which stand # for a CPU type and a company and sometimes even an OS. 
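# Two alias examples drawn from the arms below (illustrative, not part of
# the upstream script): "amiga" rewrites to the CPU/vendor pair
# m68k-unknown, and "sun4sol2" expands to both a machine and an OS,
# sparc-sun with os=-solaris2, i.e. sparc-sun-solaris2.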
386bsd) basic_machine=i386-unknown os=-bsd ;; 3b1 | 7300 | 7300-att | att-7300 | pc7300 | safari | unixpc) basic_machine=m68000-att ;; 3b*) basic_machine=we32k-att ;; a29khif) basic_machine=a29k-amd os=-udi ;; abacus) basic_machine=abacus-unknown ;; adobe68k) basic_machine=m68010-adobe os=-scout ;; alliant | fx80) basic_machine=fx80-alliant ;; altos | altos3068) basic_machine=m68k-altos ;; am29k) basic_machine=a29k-none os=-bsd ;; amd64) basic_machine=x86_64-pc ;; amd64-*) basic_machine=x86_64-`echo $basic_machine | sed 's/^[^-]*-//'` ;; amdahl) basic_machine=580-amdahl os=-sysv ;; amiga | amiga-*) basic_machine=m68k-unknown ;; amigaos | amigados) basic_machine=m68k-unknown os=-amigaos ;; amigaunix | amix) basic_machine=m68k-unknown os=-sysv4 ;; apollo68) basic_machine=m68k-apollo os=-sysv ;; apollo68bsd) basic_machine=m68k-apollo os=-bsd ;; aros) basic_machine=i386-pc os=-aros ;; asmjs) basic_machine=asmjs-unknown ;; aux) basic_machine=m68k-apple os=-aux ;; balance) basic_machine=ns32k-sequent os=-dynix ;; blackfin) basic_machine=bfin-unknown os=-linux ;; blackfin-*) basic_machine=bfin-`echo $basic_machine | sed 's/^[^-]*-//'` os=-linux ;; bluegene*) basic_machine=powerpc-ibm os=-cnk ;; c54x-*) basic_machine=tic54x-`echo $basic_machine | sed 's/^[^-]*-//'` ;; c55x-*) basic_machine=tic55x-`echo $basic_machine | sed 's/^[^-]*-//'` ;; c6x-*) basic_machine=tic6x-`echo $basic_machine | sed 's/^[^-]*-//'` ;; c90) basic_machine=c90-cray os=-unicos ;; cegcc) basic_machine=arm-unknown os=-cegcc ;; convex-c1) basic_machine=c1-convex os=-bsd ;; convex-c2) basic_machine=c2-convex os=-bsd ;; convex-c32) basic_machine=c32-convex os=-bsd ;; convex-c34) basic_machine=c34-convex os=-bsd ;; convex-c38) basic_machine=c38-convex os=-bsd ;; cray | j90) basic_machine=j90-cray os=-unicos ;; craynv) basic_machine=craynv-cray os=-unicosmp ;; cr16 | cr16-*) basic_machine=cr16-unknown os=-elf ;; crds | unos) basic_machine=m68k-crds ;; crisv32 | crisv32-* | etraxfs*) basic_machine=crisv32-axis ;; cris | cris-* | etrax*) basic_machine=cris-axis ;; crx) basic_machine=crx-unknown os=-elf ;; da30 | da30-*) basic_machine=m68k-da30 ;; decstation | decstation-3100 | pmax | pmax-* | pmin | dec3100 | decstatn) basic_machine=mips-dec ;; decsystem10* | dec10*) basic_machine=pdp10-dec os=-tops10 ;; decsystem20* | dec20*) basic_machine=pdp10-dec os=-tops20 ;; delta | 3300 | motorola-3300 | motorola-delta \ | 3300-motorola | delta-motorola) basic_machine=m68k-motorola ;; delta88) basic_machine=m88k-motorola os=-sysv3 ;; dicos) basic_machine=i686-pc os=-dicos ;; djgpp) basic_machine=i586-pc os=-msdosdjgpp ;; dpx20 | dpx20-*) basic_machine=rs6000-bull os=-bosx ;; dpx2* | dpx2*-bull) basic_machine=m68k-bull os=-sysv3 ;; e500v[12]) basic_machine=powerpc-unknown os=$os"spe" ;; e500v[12]-*) basic_machine=powerpc-`echo $basic_machine | sed 's/^[^-]*-//'` os=$os"spe" ;; ebmon29k) basic_machine=a29k-amd os=-ebmon ;; elxsi) basic_machine=elxsi-elxsi os=-bsd ;; encore | umax | mmax) basic_machine=ns32k-encore ;; es1800 | OSE68k | ose68k | ose | OSE) basic_machine=m68k-ericsson os=-ose ;; fx2800) basic_machine=i860-alliant ;; genix) basic_machine=ns32k-ns ;; gmicro) basic_machine=tron-gmicro os=-sysv ;; go32) basic_machine=i386-pc os=-go32 ;; h3050r* | hiux*) basic_machine=hppa1.1-hitachi os=-hiuxwe2 ;; h8300hms) basic_machine=h8300-hitachi os=-hms ;; h8300xray) basic_machine=h8300-hitachi os=-xray ;; h8500hms) basic_machine=h8500-hitachi os=-hms ;; harris) basic_machine=m88k-harris os=-sysv3 ;; hp300-*) basic_machine=m68k-hp ;; hp300bsd) 
basic_machine=m68k-hp os=-bsd ;; hp300hpux) basic_machine=m68k-hp os=-hpux ;; hp3k9[0-9][0-9] | hp9[0-9][0-9]) basic_machine=hppa1.0-hp ;; hp9k2[0-9][0-9] | hp9k31[0-9]) basic_machine=m68000-hp ;; hp9k3[2-9][0-9]) basic_machine=m68k-hp ;; hp9k6[0-9][0-9] | hp6[0-9][0-9]) basic_machine=hppa1.0-hp ;; hp9k7[0-79][0-9] | hp7[0-79][0-9]) basic_machine=hppa1.1-hp ;; hp9k78[0-9] | hp78[0-9]) # FIXME: really hppa2.0-hp basic_machine=hppa1.1-hp ;; hp9k8[67]1 | hp8[67]1 | hp9k80[24] | hp80[24] | hp9k8[78]9 | hp8[78]9 | hp9k893 | hp893) # FIXME: really hppa2.0-hp basic_machine=hppa1.1-hp ;; hp9k8[0-9][13679] | hp8[0-9][13679]) basic_machine=hppa1.1-hp ;; hp9k8[0-9][0-9] | hp8[0-9][0-9]) basic_machine=hppa1.0-hp ;; hppa-next) os=-nextstep3 ;; hppaosf) basic_machine=hppa1.1-hp os=-osf ;; hppro) basic_machine=hppa1.1-hp os=-proelf ;; i370-ibm* | ibm*) basic_machine=i370-ibm ;; i*86v32) basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'` os=-sysv32 ;; i*86v4*) basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'` os=-sysv4 ;; i*86v) basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'` os=-sysv ;; i*86sol2) basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'` os=-solaris2 ;; i386mach) basic_machine=i386-mach os=-mach ;; i386-vsta | vsta) basic_machine=i386-unknown os=-vsta ;; iris | iris4d) basic_machine=mips-sgi case $os in -irix*) ;; *) os=-irix4 ;; esac ;; isi68 | isi) basic_machine=m68k-isi os=-sysv ;; leon-*|leon[3-9]-*) basic_machine=sparc-`echo $basic_machine | sed 's/-.*//'` ;; m68knommu) basic_machine=m68k-unknown os=-linux ;; m68knommu-*) basic_machine=m68k-`echo $basic_machine | sed 's/^[^-]*-//'` os=-linux ;; m88k-omron*) basic_machine=m88k-omron ;; magnum | m3230) basic_machine=mips-mips os=-sysv ;; merlin) basic_machine=ns32k-utek os=-sysv ;; microblaze*) basic_machine=microblaze-xilinx ;; mingw64) basic_machine=x86_64-pc os=-mingw64 ;; mingw32) basic_machine=i686-pc os=-mingw32 ;; mingw32ce) basic_machine=arm-unknown os=-mingw32ce ;; miniframe) basic_machine=m68000-convergent ;; *mint | -mint[0-9]* | *MiNT | *MiNT[0-9]*) basic_machine=m68k-atari os=-mint ;; mips3*-*) basic_machine=`echo $basic_machine | sed -e 's/mips3/mips64/'` ;; mips3*) basic_machine=`echo $basic_machine | sed -e 's/mips3/mips64/'`-unknown ;; monitor) basic_machine=m68k-rom68k os=-coff ;; morphos) basic_machine=powerpc-unknown os=-morphos ;; moxiebox) basic_machine=moxie-unknown os=-moxiebox ;; msdos) basic_machine=i386-pc os=-msdos ;; ms1-*) basic_machine=`echo $basic_machine | sed -e 's/ms1-/mt-/'` ;; msys) basic_machine=i686-pc os=-msys ;; mvs) basic_machine=i370-ibm os=-mvs ;; nacl) basic_machine=le32-unknown os=-nacl ;; ncr3000) basic_machine=i486-ncr os=-sysv4 ;; netbsd386) basic_machine=i386-unknown os=-netbsd ;; netwinder) basic_machine=armv4l-rebel os=-linux ;; news | news700 | news800 | news900) basic_machine=m68k-sony os=-newsos ;; news1000) basic_machine=m68030-sony os=-newsos ;; news-3600 | risc-news) basic_machine=mips-sony os=-newsos ;; necv70) basic_machine=v70-nec os=-sysv ;; next | m*-next ) basic_machine=m68k-next case $os in -nextstep* ) ;; -ns2*) os=-nextstep2 ;; *) os=-nextstep3 ;; esac ;; nh3000) basic_machine=m68k-harris os=-cxux ;; nh[45]000) basic_machine=m88k-harris os=-cxux ;; nindy960) basic_machine=i960-intel os=-nindy ;; mon960) basic_machine=i960-intel os=-mon960 ;; nonstopux) basic_machine=mips-compaq os=-nonstopux ;; np1) basic_machine=np1-gould ;; neo-tandem) basic_machine=neo-tandem ;; nse-tandem) basic_machine=nse-tandem ;; nsr-tandem) basic_machine=nsr-tandem ;; nsx-tandem) basic_machine=nsx-tandem 
;; op50n-* | op60c-*) basic_machine=hppa1.1-oki os=-proelf ;; openrisc | openrisc-*) basic_machine=or32-unknown ;; os400) basic_machine=powerpc-ibm os=-os400 ;; OSE68000 | ose68000) basic_machine=m68000-ericsson os=-ose ;; os68k) basic_machine=m68k-none os=-os68k ;; pa-hitachi) basic_machine=hppa1.1-hitachi os=-hiuxwe2 ;; paragon) basic_machine=i860-intel os=-osf ;; parisc) basic_machine=hppa-unknown os=-linux ;; parisc-*) basic_machine=hppa-`echo $basic_machine | sed 's/^[^-]*-//'` os=-linux ;; pbd) basic_machine=sparc-tti ;; pbb) basic_machine=m68k-tti ;; pc532 | pc532-*) basic_machine=ns32k-pc532 ;; pc98) basic_machine=i386-pc ;; pc98-*) basic_machine=i386-`echo $basic_machine | sed 's/^[^-]*-//'` ;; pentium | p5 | k5 | k6 | nexgen | viac3) basic_machine=i586-pc ;; pentiumpro | p6 | 6x86 | athlon | athlon_*) basic_machine=i686-pc ;; pentiumii | pentium2 | pentiumiii | pentium3) basic_machine=i686-pc ;; pentium4) basic_machine=i786-pc ;; pentium-* | p5-* | k5-* | k6-* | nexgen-* | viac3-*) basic_machine=i586-`echo $basic_machine | sed 's/^[^-]*-//'` ;; pentiumpro-* | p6-* | 6x86-* | athlon-*) basic_machine=i686-`echo $basic_machine | sed 's/^[^-]*-//'` ;; pentiumii-* | pentium2-* | pentiumiii-* | pentium3-*) basic_machine=i686-`echo $basic_machine | sed 's/^[^-]*-//'` ;; pentium4-*) basic_machine=i786-`echo $basic_machine | sed 's/^[^-]*-//'` ;; pn) basic_machine=pn-gould ;; power) basic_machine=power-ibm ;; ppc | ppcbe) basic_machine=powerpc-unknown ;; ppc-* | ppcbe-*) basic_machine=powerpc-`echo $basic_machine | sed 's/^[^-]*-//'` ;; ppcle | powerpclittle) basic_machine=powerpcle-unknown ;; ppcle-* | powerpclittle-*) basic_machine=powerpcle-`echo $basic_machine | sed 's/^[^-]*-//'` ;; ppc64) basic_machine=powerpc64-unknown ;; ppc64-*) basic_machine=powerpc64-`echo $basic_machine | sed 's/^[^-]*-//'` ;; ppc64le | powerpc64little) basic_machine=powerpc64le-unknown ;; ppc64le-* | powerpc64little-*) basic_machine=powerpc64le-`echo $basic_machine | sed 's/^[^-]*-//'` ;; ps2) basic_machine=i386-ibm ;; pw32) basic_machine=i586-unknown os=-pw32 ;; rdos | rdos64) basic_machine=x86_64-pc os=-rdos ;; rdos32) basic_machine=i386-pc os=-rdos ;; rom68k) basic_machine=m68k-rom68k os=-coff ;; rm[46]00) basic_machine=mips-siemens ;; rtpc | rtpc-*) basic_machine=romp-ibm ;; s390 | s390-*) basic_machine=s390-ibm ;; s390x | s390x-*) basic_machine=s390x-ibm ;; sa29200) basic_machine=a29k-amd os=-udi ;; sb1) basic_machine=mipsisa64sb1-unknown ;; sb1el) basic_machine=mipsisa64sb1el-unknown ;; sde) basic_machine=mipsisa32-sde os=-elf ;; sei) basic_machine=mips-sei os=-seiux ;; sequent) basic_machine=i386-sequent ;; sh) basic_machine=sh-hitachi os=-hms ;; sh5el) basic_machine=sh5le-unknown ;; sh64) basic_machine=sh64-unknown ;; sparclite-wrs | simso-wrs) basic_machine=sparclite-wrs os=-vxworks ;; sps7) basic_machine=m68k-bull os=-sysv2 ;; spur) basic_machine=spur-unknown ;; st2000) basic_machine=m68k-tandem ;; stratus) basic_machine=i860-stratus os=-sysv4 ;; strongarm-* | thumb-*) basic_machine=arm-`echo $basic_machine | sed 's/^[^-]*-//'` ;; sun2) basic_machine=m68000-sun ;; sun2os3) basic_machine=m68000-sun os=-sunos3 ;; sun2os4) basic_machine=m68000-sun os=-sunos4 ;; sun3os3) basic_machine=m68k-sun os=-sunos3 ;; sun3os4) basic_machine=m68k-sun os=-sunos4 ;; sun4os3) basic_machine=sparc-sun os=-sunos3 ;; sun4os4) basic_machine=sparc-sun os=-sunos4 ;; sun4sol2) basic_machine=sparc-sun os=-solaris2 ;; sun3 | sun3-*) basic_machine=m68k-sun ;; sun4) basic_machine=sparc-sun ;; sun386 | sun386i | roadrunner) 
basic_machine=i386-sun ;; sv1) basic_machine=sv1-cray os=-unicos ;; symmetry) basic_machine=i386-sequent os=-dynix ;; t3e) basic_machine=alphaev5-cray os=-unicos ;; t90) basic_machine=t90-cray os=-unicos ;; tile*) basic_machine=$basic_machine-unknown os=-linux-gnu ;; tx39) basic_machine=mipstx39-unknown ;; tx39el) basic_machine=mipstx39el-unknown ;; toad1) basic_machine=pdp10-xkl os=-tops20 ;; tower | tower-32) basic_machine=m68k-ncr ;; tpf) basic_machine=s390x-ibm os=-tpf ;; udi29k) basic_machine=a29k-amd os=-udi ;; ultra3) basic_machine=a29k-nyu os=-sym1 ;; v810 | necv810) basic_machine=v810-nec os=-none ;; vaxv) basic_machine=vax-dec os=-sysv ;; vms) basic_machine=vax-dec os=-vms ;; vpp*|vx|vx-*) basic_machine=f301-fujitsu ;; vxworks960) basic_machine=i960-wrs os=-vxworks ;; vxworks68) basic_machine=m68k-wrs os=-vxworks ;; vxworks29k) basic_machine=a29k-wrs os=-vxworks ;; wasm32) basic_machine=wasm32-unknown ;; w65*) basic_machine=w65-wdc os=-none ;; w89k-*) basic_machine=hppa1.1-winbond os=-proelf ;; xbox) basic_machine=i686-pc os=-mingw32 ;; xps | xps100) basic_machine=xps100-honeywell ;; xscale-* | xscalee[bl]-*) basic_machine=`echo $basic_machine | sed 's/^xscale/arm/'` ;; ymp) basic_machine=ymp-cray os=-unicos ;; z8k-*-coff) basic_machine=z8k-unknown os=-sim ;; z80-*-coff) basic_machine=z80-unknown os=-sim ;; none) basic_machine=none-none os=-none ;; # Here we handle the default manufacturer of certain CPU types. It is in # some cases the only manufacturer, in others, it is the most popular. w89k) basic_machine=hppa1.1-winbond ;; op50n) basic_machine=hppa1.1-oki ;; op60c) basic_machine=hppa1.1-oki ;; romp) basic_machine=romp-ibm ;; mmix) basic_machine=mmix-knuth ;; rs6000) basic_machine=rs6000-ibm ;; vax) basic_machine=vax-dec ;; pdp10) # there are many clones, so DEC is not a safe bet basic_machine=pdp10-unknown ;; pdp11) basic_machine=pdp11-dec ;; we32k) basic_machine=we32k-att ;; sh[1234] | sh[24]a | sh[24]aeb | sh[34]eb | sh[1234]le | sh[23]ele) basic_machine=sh-unknown ;; sparc | sparcv8 | sparcv9 | sparcv9b | sparcv9v) basic_machine=sparc-sun ;; cydra) basic_machine=cydra-cydrome ;; orion) basic_machine=orion-highlevel ;; orion105) basic_machine=clipper-highlevel ;; mac | mpw | mac-mpw) basic_machine=m68k-apple ;; pmac | pmac-mpw) basic_machine=powerpc-apple ;; *-unknown) # Make sure to match an already-canonicalized machine name. ;; *) echo Invalid configuration \`$1\': machine \`$basic_machine\' not recognized 1>&2 exit 1 ;; esac # Here we canonicalize certain aliases for manufacturers. case $basic_machine in *-digital*) basic_machine=`echo $basic_machine | sed 's/digital.*/dec/'` ;; *-commodore*) basic_machine=`echo $basic_machine | sed 's/commodore.*/cbm/'` ;; *) ;; esac # Decode manufacturer-specific aliases for certain operating systems. if [ x"$os" != x"" ] then case $os in # First match some system type aliases # that might get confused with valid system types. # -solaris* is a basic system type, with this one exception. -auroraux) os=-auroraux ;; -solaris1 | -solaris1.*) os=`echo $os | sed -e 's|solaris1|sunos4|'` ;; -solaris) os=-solaris2 ;; -svr4*) os=-sysv4 ;; -unixware*) os=-sysv4.2uw ;; -gnu/linux*) os=`echo $os | sed -e 's|gnu/linux|linux-gnu|'` ;; # First accept the basic system types. # The portable systems comes first. # Each alternative MUST END IN A *, to match a version number. # -sysv* is not here because it comes later, after sysvr4. 
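# Ordering example (illustrative, not part of the upstream script): because
# -sysv* is deliberately absent from the accept list below, an input OS of
# -sysvr4 is not accepted verbatim; it falls through to the -sysvr4) arm
# further down and is canonicalized to -sysv4, and only then does the late
# -sysv*) catch-all pass already-canonical forms such as -sysv4.2uw through
# untouched.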
-gnu* | -bsd* | -mach* | -minix* | -genix* | -ultrix* | -irix* \ | -*vms* | -sco* | -esix* | -isc* | -aix* | -cnk* | -sunos | -sunos[34]*\ | -hpux* | -unos* | -osf* | -luna* | -dgux* | -auroraux* | -solaris* \ | -sym* | -kopensolaris* | -plan9* \ | -amigaos* | -amigados* | -msdos* | -newsos* | -unicos* | -aof* \ | -aos* | -aros* | -cloudabi* | -sortix* \ | -nindy* | -vxsim* | -vxworks* | -ebmon* | -hms* | -mvs* \ | -clix* | -riscos* | -uniplus* | -iris* | -rtu* | -xenix* \ | -hiux* | -386bsd* | -knetbsd* | -mirbsd* | -netbsd* \ | -bitrig* | -openbsd* | -solidbsd* | -libertybsd* \ | -ekkobsd* | -kfreebsd* | -freebsd* | -riscix* | -lynxos* \ | -bosx* | -nextstep* | -cxux* | -aout* | -elf* | -oabi* \ | -ptx* | -coff* | -ecoff* | -winnt* | -domain* | -vsta* \ | -udi* | -eabi* | -lites* | -ieee* | -go32* | -aux* \ | -chorusos* | -chorusrdb* | -cegcc* | -glidix* \ | -cygwin* | -msys* | -pe* | -psos* | -moss* | -proelf* | -rtems* \ | -midipix* | -mingw32* | -mingw64* | -linux-gnu* | -linux-android* \ | -linux-newlib* | -linux-musl* | -linux-uclibc* \ | -uxpv* | -beos* | -mpeix* | -udk* | -moxiebox* \ | -interix* | -uwin* | -mks* | -rhapsody* | -darwin* | -opened* \ | -openstep* | -oskit* | -conix* | -pw32* | -nonstopux* \ | -storm-chaos* | -tops10* | -tenex* | -tops20* | -its* \ | -os2* | -vos* | -palmos* | -uclinux* | -nucleus* \ | -morphos* | -superux* | -rtmk* | -rtmk-nova* | -windiss* \ | -powermax* | -dnix* | -nx6 | -nx7 | -sei* | -dragonfly* \ | -skyos* | -haiku* | -rdos* | -toppers* | -drops* | -es* \ | -onefs* | -tirtos* | -phoenix* | -fuchsia* | -redox*) # Remember, each alternative MUST END IN *, to match a version number. ;; -qnx*) case $basic_machine in x86-* | i*86-*) ;; *) os=-nto$os ;; esac ;; -nto-qnx*) ;; -nto*) os=`echo $os | sed -e 's|nto|nto-qnx|'` ;; -sim | -es1800* | -hms* | -xray | -os68k* | -none* | -v88r* \ | -windows* | -osx | -abug | -netware* | -os9* | -beos* | -haiku* \ | -macos* | -mpw* | -magic* | -mmixware* | -mon960* | -lnews*) ;; -mac*) os=`echo $os | sed -e 's|mac|macos|'` ;; -linux-dietlibc) os=-linux-dietlibc ;; -linux*) os=`echo $os | sed -e 's|linux|linux-gnu|'` ;; -sunos5*) os=`echo $os | sed -e 's|sunos5|solaris2|'` ;; -sunos6*) os=`echo $os | sed -e 's|sunos6|solaris3|'` ;; -opened*) os=-openedition ;; -os400*) os=-os400 ;; -wince*) os=-wince ;; -osfrose*) os=-osfrose ;; -osf*) os=-osf ;; -utek*) os=-bsd ;; -dynix*) os=-bsd ;; -acis*) os=-aos ;; -atheos*) os=-atheos ;; -syllable*) os=-syllable ;; -386bsd) os=-bsd ;; -ctix* | -uts*) os=-sysv ;; -nova*) os=-rtmk-nova ;; -ns2 ) os=-nextstep2 ;; -nsk*) os=-nsk ;; # Preserve the version number of sinix5. -sinix5.*) os=`echo $os | sed -e 's|sinix|sysv|'` ;; -sinix*) os=-sysv4 ;; -tpf*) os=-tpf ;; -triton*) os=-sysv3 ;; -oss*) os=-sysv3 ;; -svr4) os=-sysv4 ;; -svr3) os=-sysv3 ;; -sysvr4) os=-sysv4 ;; # This must come after -sysvr4. -sysv*) ;; -ose*) os=-ose ;; -es1800*) os=-ose ;; -xenix) os=-xenix ;; -*mint | -mint[0-9]* | -*MiNT | -MiNT[0-9]*) os=-mint ;; -aros*) os=-aros ;; -zvmoe) os=-zvmoe ;; -dicos*) os=-dicos ;; -nacl*) ;; -ios) ;; -none) ;; *) # Get rid of the `-' at the beginning of $os. os=`echo $os | sed 's/[^-]*-//'` echo Invalid configuration \`$1\': system \`$os\' not recognized 1>&2 exit 1 ;; esac else # Here we handle the default operating systems that come with various machines. # The value should be what the vendor currently ships out the door with their # machine or put another way, the most popular os provided with the machine. 
# Note that if you're going to try to match "-MANUFACTURER" here (say, # "-sun"), then you have to tell the case statement up towards the top # that MANUFACTURER isn't an operating system. Otherwise, code above # will signal an error saying that MANUFACTURER isn't an operating # system, and we'll never get to this point. case $basic_machine in score-*) os=-elf ;; spu-*) os=-elf ;; *-acorn) os=-riscix1.2 ;; arm*-rebel) os=-linux ;; arm*-semi) os=-aout ;; c4x-* | tic4x-*) os=-coff ;; c8051-*) os=-elf ;; hexagon-*) os=-elf ;; tic54x-*) os=-coff ;; tic55x-*) os=-coff ;; tic6x-*) os=-coff ;; # This must come before the *-dec entry. pdp10-*) os=-tops20 ;; pdp11-*) os=-none ;; *-dec | vax-*) os=-ultrix4.2 ;; m68*-apollo) os=-domain ;; i386-sun) os=-sunos4.0.2 ;; m68000-sun) os=-sunos3 ;; m68*-cisco) os=-aout ;; mep-*) os=-elf ;; mips*-cisco) os=-elf ;; mips*-*) os=-elf ;; or32-*) os=-coff ;; *-tti) # must be before sparc entry or we get the wrong os. os=-sysv3 ;; sparc-* | *-sun) os=-sunos4.1.1 ;; pru-*) os=-elf ;; *-be) os=-beos ;; *-haiku) os=-haiku ;; *-ibm) os=-aix ;; *-knuth) os=-mmixware ;; *-wec) os=-proelf ;; *-winbond) os=-proelf ;; *-oki) os=-proelf ;; *-hp) os=-hpux ;; *-hitachi) os=-hiux ;; i860-* | *-att | *-ncr | *-altos | *-motorola | *-convergent) os=-sysv ;; *-cbm) os=-amigaos ;; *-dg) os=-dgux ;; *-dolphin) os=-sysv3 ;; m68k-ccur) os=-rtu ;; m88k-omron*) os=-luna ;; *-next ) os=-nextstep ;; *-sequent) os=-ptx ;; *-crds) os=-unos ;; *-ns) os=-genix ;; i370-*) os=-mvs ;; *-next) os=-nextstep3 ;; *-gould) os=-sysv ;; *-highlevel) os=-bsd ;; *-encore) os=-bsd ;; *-sgi) os=-irix ;; *-siemens) os=-sysv4 ;; *-masscomp) os=-rtu ;; f30[01]-fujitsu | f700-fujitsu) os=-uxpv ;; *-rom68k) os=-coff ;; *-*bug) os=-coff ;; *-apple) os=-macos ;; *-atari*) os=-mint ;; *) os=-none ;; esac fi # Here we handle the case where we know the os, and the CPU type, but not the # manufacturer. We pick the logical manufacturer. 
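# End-to-end example across this stage and the vendor fix-up that follows
# (illustrative, not part of the upstream script): a bare "sparc" became
# sparc-unknown earlier, picks up the default os=-sunos4.1.1 from the
# sparc-* arm above, and the -sunos*) vendor rule below then substitutes
# sun for unknown:
#
#   $ sh ./config.sub sparc
#   sparc-sun-sunos4.1.1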
vendor=unknown case $basic_machine in *-unknown) case $os in -riscix*) vendor=acorn ;; -sunos*) vendor=sun ;; -cnk*|-aix*) vendor=ibm ;; -beos*) vendor=be ;; -hpux*) vendor=hp ;; -mpeix*) vendor=hp ;; -hiux*) vendor=hitachi ;; -unos*) vendor=crds ;; -dgux*) vendor=dg ;; -luna*) vendor=omron ;; -genix*) vendor=ns ;; -mvs* | -opened*) vendor=ibm ;; -os400*) vendor=ibm ;; -ptx*) vendor=sequent ;; -tpf*) vendor=ibm ;; -vxsim* | -vxworks* | -windiss*) vendor=wrs ;; -aux*) vendor=apple ;; -hms*) vendor=hitachi ;; -mpw* | -macos*) vendor=apple ;; -*mint | -mint[0-9]* | -*MiNT | -MiNT[0-9]*) vendor=atari ;; -vos*) vendor=stratus ;; esac basic_machine=`echo $basic_machine | sed "s/unknown/$vendor/"` ;; esac echo $basic_machine$os exit # Local variables: # eval: (add-hook 'write-file-hooks 'time-stamp) # time-stamp-start: "timestamp='" # time-stamp-format: "%:y-%02m-%02d" # time-stamp-end: "'" # End: davs2-1.6/build/linux/configure000077500000000000000000001233321337322544400165570ustar00rootroot00000000000000#!/bin/bash if test x"$1" = x"-h" -o x"$1" = x"--help" ; then cat <<EOF Usage: ./configure [options] Help: -h, --help print this message Standard options: --prefix=PREFIX install architecture-independent files in PREFIX [/usr/local] --exec-prefix=EPREFIX install architecture-dependent files in EPREFIX [PREFIX] --bindir=DIR install binaries in DIR [EPREFIX/bin] --libdir=DIR install libs in DIR [EPREFIX/lib] --includedir=DIR install includes in DIR [PREFIX/include] --extra-asflags=EASFLAGS add EASFLAGS to ASFLAGS --extra-cflags=ECFLAGS add ECFLAGS to CFLAGS --extra-ldflags=ELDFLAGS add ELDFLAGS to LDFLAGS --extra-rcflags=ERCFLAGS add ERCFLAGS to RCFLAGS Configuration options: --disable-cli disable cli --system-libdavs2 use system libdavs2 instead of internal --enable-shared build shared library --disable-static don't build static library --disable-asm disable platform-specific assembly optimizations --disable-interlaced disable interlaced decoding support --disable-gpl disable GPL-only features --bit-depth=BIT_DEPTH set output bit depth (8-10) [8] --chroma-format=FORMAT output chroma format (420, 422, 444, all) [all] Advanced options: --disable-thread disable multithreaded decoding --disable-win32thread disable win32threads (windows only) --enable-lto enable link-time optimization --enable-debug add -g --enable-gprof add -pg --enable-strip add -s --enable-pic build position-independent code --disable-opencl disable OpenCL features --disable-vsx disable VSX optimizations (POWER) Cross-compilation: --host=HOST build programs to run on HOST --cross-prefix=PREFIX use PREFIX for compilation tools --sysroot=SYSROOT root of cross-build tree EOF exit 1 fi log_check() { echo -n "checking $1... " >> config.log } log_ok() { echo "yes" >> config.log } log_fail() { echo "no" >> config.log } log_msg() { echo "$1" >> config.log } cc_cflags() { # several non g++ compilers issue an incredibly large number of warnings on high warning levels, # suppress them by reducing the warning level rather than having to use #pragmas for arg in $*; do [[ "$arg" = -falign-loops* ]] && arg= [ "$arg" = -fno-tree-vectorize ] && arg= [ "$arg" = -Wshadow ] && arg= [ "$arg" = -Wno-maybe-uninitialized ] && arg= [[ "$arg" = -mpreferred-stack-boundary* ]] && arg= [[ "$arg" = -l* ]] && arg= [[ "$arg" = -L* ]] && arg= if [ $compiler_style = MS ]; then [ "$arg" = -ffast-math ] && arg="-fp:fast" [ "$arg" = -Wall ] && arg= [ "$arg" = -Werror ] && arg="-W3 -WX" [ "$arg" = -g ] && arg=-Z7 [ "$arg" = -fomit-frame-pointer ] && arg= [ "$arg" = -s ] && arg= [ "$arg" = -fPIC ] && arg= else [ "$arg" = -ffast-math ] && arg= [ "$arg" = -Wall ] && arg= [ "$arg" = -Werror ] && arg="-w3 -Werror" fi [ $compiler = CL -a "$arg" = -O3 ] && arg=-O2 [ -n "$arg" ] && echo -n "$arg " done } cl_ldflags() { for arg in $*; do arg=${arg/LIBPATH/libpath} [ "${arg#-libpath:}" == "$arg" -a "${arg#-l}" != "$arg" ] && arg=${arg#-l}.lib [ "${arg#-L}" != "$arg" ] && arg=-libpath:${arg#-L} [ "$arg" = -Wl,--large-address-aware ] && arg=-largeaddressaware [ "$arg" = -s ] && arg= [ "$arg" = -Wl,-Bsymbolic ] && arg= [ "$arg" = -fno-tree-vectorize ] && arg= [ "$arg" = -Werror ] && arg= [ "$arg" = -Wshadow ] && arg= [ "$arg" = -Wmaybe-uninitialized ] && arg= [[ "$arg" = -Qdiag-error* ]] && arg= arg=${arg/pthreadGC/pthreadVC} [ "$arg" = avifil32.lib ] && arg=vfw32.lib [ "$arg" = gpac_static.lib ] && arg=libgpac_static.lib [ "$arg" = davs2.lib ] && arg=libdavs2.lib [ -n "$arg" ] && echo -n "$arg " done } cc_check() { if [ -z "$3" ]; then if [ -z "$1$2" ]; then log_check "whether $CC works" elif [ -z "$1" ]; then log_check "for $2" else log_check "for $1" fi elif [ -z "$1" ]; then if [ -z "$2" ]; then log_check "whether $CC supports $3" else log_check "whether $CC supports $3 with $2" fi else log_check "for $3 in $1"; fi rm -f conftest.c for arg in $1; do echo "#include <$arg>" >> conftest.c done echo "int main (void) { $3 return 0; }" >> conftest.c if [ $compiler_style = MS ]; then cc_cmd="$CC conftest.c $(cc_cflags $CFLAGS $CHECK_CFLAGS $2) -link $(cl_ldflags $2 $LDFLAGSCLI $LDFLAGS)" else cc_cmd="$CC conftest.c $CFLAGS $CHECK_CFLAGS $2 $LDFLAGSCLI $LDFLAGS -o conftest" fi if $cc_cmd
>conftest.log 2>&1; then res=$? log_ok else res=$? log_fail log_msg "Failed commandline was:" log_msg "--------------------------------------------------" log_msg "$cc_cmd" cat conftest.log >> config.log log_msg "--------------------------------------------------" log_msg "Failed program was:" log_msg "--------------------------------------------------" cat conftest.c >> config.log log_msg "--------------------------------------------------" fi return $res } cpp_check() { log_check "whether $3 is true" rm -f conftest.c for arg in $1; do echo "#include <$arg>" >> conftest.c done echo -e "#if !($3) \n#error $4 \n#endif " >> conftest.c if [ $compiler_style = MS ]; then cpp_cmd="$CC conftest.c $(cc_cflags $CFLAGS $2) -P" else cpp_cmd="$CC conftest.c $CFLAGS $2 -E -o conftest" fi if $cpp_cmd >conftest.log 2>&1; then res=$? log_ok else res=$? log_fail log_msg "--------------------------------------------------" cat conftest.log >> config.log log_msg "--------------------------------------------------" log_msg "Failed program was:" log_msg "--------------------------------------------------" cat conftest.c >> config.log log_msg "--------------------------------------------------" fi return $res } as_check() { log_check "whether $AS supports $1" echo "$1" > conftest$AS_EXT as_cmd="$AS conftest$AS_EXT $ASFLAGS $2 -o conftest.o" if $as_cmd >conftest.log 2>&1; then res=$? log_ok else res=$? log_fail log_msg "Failed commandline was:" log_msg "--------------------------------------------------" log_msg "$as_cmd" cat conftest.log >> config.log log_msg "--------------------------------------------------" log_msg "Failed program was:" log_msg "--------------------------------------------------" cat conftest$AS_EXT >> config.log log_msg "--------------------------------------------------" fi return $res } rc_check() { log_check "whether $RC works" echo "$1" > conftest.rc if [ $compiler = GNU ]; then rc_cmd="$RC $RCFLAGS -o conftest.o conftest.rc" else rc_cmd="$RC $RCFLAGS -foconftest.o conftest.rc" fi if $rc_cmd >conftest.log 2>&1; then res=$? log_ok else res=$? log_fail log_msg "Failed commandline was:" log_msg "--------------------------------------------------" log_msg "$rc_cmd" cat conftest.log >> config.log log_msg "--------------------------------------------------" log_msg "Failed program was:" log_msg "--------------------------------------------------" cat conftest.rc >> config.log log_msg "--------------------------------------------------" fi return $res } define() { echo "#define $1$([ -n "$2" ] && echo " $2" || echo " 1")" >> config.h } die() { log_msg "DIED: $@" echo "$@" exit 1 } configure_system_override() { log_check "system libdavs2 configuration" davs2_config_path="$1/davs2_config.h" if [ -e "$davs2_config_path" ]; then res=$? 
log_ok arg="$(grep '#define DAVS2_GPL ' $davs2_config_path | sed -e 's/#define DAVS2_GPL *//; s/ *$//')" if [ -n "$arg" ]; then [ "$arg" = 0 ] && arg="no" || arg="yes" [ "$arg" != "$gpl" ] && die "Incompatible license with system libdavs2" fi arg="$(grep '#define DAVS2_BIT_DEPTH ' $davs2_config_path | sed -e 's/#define DAVS2_BIT_DEPTH *//; s/ *$//')" if [ -n "$arg" ]; then if [ "$arg" != "$bit_depth" ]; then echo "Override output bit depth with system libdavs2 configuration" bit_depth="$arg" fi fi arg="$(grep '#define DAVS2_CHROMA_FORMAT ' $davs2_config_path | sed -e 's/#define DAVS2_CHROMA_FORMAT *//; s/ *$//')" if [ -n "$arg" ]; then [ "$arg" = 0 ] && arg="all" || arg="${arg#DAVS2_CSP_I}" if [ "$arg" != "$chroma_format" ]; then echo "Override output chroma format with system libdavs2 configuration" chroma_format="$arg" fi fi arg="$(grep '#define DAVS2_INTERLACED ' $davs2_config_path | sed -e 's/#define DAVS2_INTERLACED *//; s/ *$//')" if [ -n "$arg" ]; then [ "$arg" = 0 ] && arg="no" || arg="yes" if [ "$arg" != "$interlaced" ]; then echo "Override interlaced encoding support with system libdavs2 configuration" interlaced="$arg" fi fi else res=$? log_fail log_msg "Failed search path was: $davs2_config_path" fi return $res } rm -f davs2_config.h config.h config.mak config.log davs2.pc davs2.def conftest* # Construct a path to the specified directory relative to the working directory relative_path() { local base="${PWD%/}" local path="$(cd "$1" >/dev/null; printf '%s/.' "${PWD%/}")" local up='' while [[ $path != "$base/"* ]]; do base="${base%/*}" up="../$up" done dirname "$up${path#"$base/"}" } SRCPATH="$(cd ../../source ; pwd)" [ "$SRCPATH" = "$(pwd)" ] && SRCPATH=. [ -n "$(echo $SRCPATH | grep ' ')" ] && die "Out of tree builds are impossible with whitespace in source path." BUILDPATH="$(cd . ; pwd)" echo "$SRCPATH" | grep -q ' ' && die "Out of tree builds are impossible with whitespace in source path." echo "$BUILDPATH" | grep -q ' ' && die "Out of tree builds are impossible with whitespace in source path." [ -e "$BUILDPATH/config.h" -o -e "$BUILDPATH/davs2_config.h" ] && die "Out of tree builds are impossible with config.h/davs2_config.h in source dir." prefix='/usr/local' exec_prefix='${prefix}' bindir='${exec_prefix}/bin' libdir='${exec_prefix}/lib' includedir='${prefix}/include' DEVNULL='/dev/null' cli="yes" cli_libdavs2="internal" shared="no" static="yes" gpl="yes" thread="auto" asm="auto" interlaced="yes" lto="no" debug="no" gprof="no" strip="no" pic="no" bit_depth="8" chroma_format="all" compiler="GNU" compiler_style="GNU" opencl="no" vsx="auto" CFLAGS="$CFLAGS -Wall -I. -I\$(SRCPATH)" LDFLAGS="$LDFLAGS" LDFLAGSCLI="$LDFLAGSCLI" ASFLAGS="$ASFLAGS -I. 
-I\$(SRCPATH)" RCFLAGS="$RCFLAGS" CHECK_CFLAGS="" HAVE_GETOPT_LONG=1 cross_prefix="" EXE="" AS_EXT=".S" NL=" " # list of all preprocessor HAVE values we can define CONFIG_HAVE="MALLOC_H ALTIVEC ALTIVEC_H MMX ARMV6 ARMV6T2 NEON BEOSTHREAD POSIXTHREAD WIN32THREAD THREAD LOG2F \ GPL VECTOREXT INTERLACED CPU_COUNT OPENCL THP X86_INLINE_ASM AS_FUNC INTEL_DISPATCHER \ MSA MMAP WINRT VSX" # parse options for opt do optarg="${opt#*=}" case "$opt" in --prefix=*) prefix="$optarg" ;; --exec-prefix=*) exec_prefix="$optarg" ;; --bindir=*) bindir="$optarg" ;; --libdir=*) libdir="$optarg" ;; --includedir=*) includedir="$optarg" ;; --disable-cli) cli="no" ;; --system-libdavs2) cli_libdavs2="system" ;; --enable-shared) shared="yes" ;; --disable-static) static="no" ;; --disable-asm) asm="no" ;; --disable-interlaced) interlaced="no" ;; --disable-gpl) gpl="no" ;; --extra-asflags=*) ASFLAGS="$ASFLAGS $optarg" ;; --extra-cflags=*) CFLAGS="$CFLAGS $optarg" ;; --extra-ldflags=*) LDFLAGS="$LDFLAGS $optarg" ;; --extra-rcflags=*) RCFLAGS="$RCFLAGS $optarg" ;; --disable-thread) thread="no" ;; --disable-win32thread) [ "$thread" != "no" ] && thread="posix" ;; --enable-lto) lto="auto" ;; --enable-debug) debug="yes" ;; --enable-gprof) CFLAGS="$CFLAGS -pg" LDFLAGS="$LDFLAGS -pg" gprof="yes" ;; --enable-strip) strip="yes" ;; --enable-pic) pic="yes" ;; --host=*) host="$optarg" ;; --disable-vsx) vsx="no" ;; --disable-opencl) opencl="no" ;; --cross-prefix=*) cross_prefix="$optarg" ;; --sysroot=*) CFLAGS="$CFLAGS --sysroot=$optarg" LDFLAGS="$LDFLAGS --sysroot=$optarg" ;; --bit-depth=*) bit_depth="$optarg" if [ "$bit_depth" -lt "8" -o "$bit_depth" -gt "10" ]; then echo "Supplied bit depth must be in range [8,10]." exit 1 elif [[ "$bit_depth" = "9" || "$bit_depth" = "10" ]]; then echo "BitDepth $bit_depth not supported currently." exit 1 fi bit_depth=`expr $bit_depth + 0` ;; --chroma-format=*) chroma_format="$optarg" if [ $chroma_format != "420" -a $chroma_format != "422" -a $chroma_format != "444" -a $chroma_format != "all" ]; then echo "Supplied chroma format must be 420, 422, 444 or all." exit 1 fi ;; *) echo "Unknown option $opt, ignored" ;; esac done [ "$cli" = "no" -a "$shared" = "no" -a "$static" = "no" ] && die "Nothing to build. Enable cli, shared or static." CC="${CC-${cross_prefix}g++}" STRIP="${STRIP-${cross_prefix}strip}" INSTALL="${INSTALL-install}" PKGCONFIG="${PKGCONFIG-${cross_prefix}pkg-config}" # ar and ranlib doesn't load the LTO plugin by default, prefer the g++-prefixed wrappers which does. if ${cross_prefix}g++-ar --version >/dev/null 2>&1; then AR="${AR-${cross_prefix}g++-ar}" else AR="${AR-${cross_prefix}ar}" fi if ${cross_prefix}g++-ranlib --version >/dev/null 2>&1; then RANLIB="${RANLIB-${cross_prefix}g++-ranlib}" else RANLIB="${RANLIB-${cross_prefix}ranlib}" fi if [ "x$host" = x ]; then host=`${BUILDPATH}/config.guess` fi # normalize a triplet into a quadruplet host=`${BUILDPATH}/config.sub $host` # split $host host_cpu="${host%%-*}" host="${host#*-}" host_vendor="${host%%-*}" host_os="${host#*-}" trap 'rm -f conftest*' EXIT # test for use of compilers that require specific handling cc_base=`basename "$CC"` QPRE="-" if [[ $host_os = mingw* || $host_os = cygwin* ]]; then if [[ "$cc_base" = icl || "$cc_base" = icl[\ .]* ]]; then # Windows Intel Compiler creates dependency generation with absolute Windows paths, Cygwin's make does not support Windows paths. 
[[ $host_os = cygwin* ]] && die "Windows Intel Compiler support requires MSYS" compiler=ICL compiler_style=MS CFLAGS="$CFLAGS -Qstd=c99 -nologo -Qms0 -DHAVE_STRING_H -I\$(SRCPATH)/extras" QPRE="-Q" `$CC 2>&1 | grep -q IA-32` && host_cpu=i486 `$CC 2>&1 | grep -q "Intel(R) 64"` && host_cpu=x86_64 cpp_check "" "" "_MSC_VER >= 1400" || die "Windows Intel Compiler support requires Visual Studio 2005 or newer" if cc_check '' -Qdiag-error:10006,10157 ; then CHECK_CFLAGS="$CHECK_CFLAGS -Qdiag-error:10006,10157" fi elif [[ "$cc_base" = cl || "$cc_base" = cl[\ .]* ]]; then # Standard Microsoft Visual Studio compiler=CL compiler_style=MS CFLAGS="$CFLAGS -nologo -GS- -DHAVE_STRING_H -I\$(SRCPATH)/extras" `$CC 2>&1 | grep -q 'x86'` && host_cpu=i486 `$CC 2>&1 | grep -q 'x64'` && host_cpu=x86_64 cpp_check '' '' '_MSC_VER > 1800 || (_MSC_VER == 1800 && _MSC_FULL_VER >= 180030324)' || die "Microsoft Visual Studio support requires Visual Studio 2013 Update 2 or newer" else # MinGW uses broken pre-VS2015 Microsoft printf functions unless it's told to use the POSIX ones. CFLAGS="$CFLAGS -D_POSIX_C_SOURCE=200112L" fi else if [[ "$cc_base" = icc || "$cc_base" = icc[\ .]* ]]; then AR="xiar" compiler=ICC fi fi if [[ "$cc_base" = clang* ]]; then if cc_check '' -Werror=unknown-warning-option ; then CHECK_CFLAGS="$CHECK_CFLAGS -Werror=unknown-warning-option" fi fi libm="" case $host_os in beos*) SYS="BEOS" define HAVE_MALLOC_H ;; darwin*) SYS="MACOSX" libm="-lm" if [ "$pic" = "no" ]; then cc_check "" -mdynamic-no-pic && CFLAGS="$CFLAGS -mdynamic-no-pic" fi # TODO: Fix compiling error under mac osx (force disabled now) asm="no" ;; freebsd*) SYS="FREEBSD" libm="-lm" ;; kfreebsd*-gnu) SYS="FREEBSD" define HAVE_MALLOC_H libm="-lm" ;; netbsd*) SYS="NETBSD" libm="-lm" ;; openbsd*) SYS="OPENBSD" libm="-lm" ;; *linux*) SYS="LINUX" define HAVE_MALLOC_H libm="-lm" ;; gnu*) SYS="HURD" define HAVE_MALLOC_H libm="-lm" ;; cygwin*|mingw*|msys*) EXE=".exe" if [[ $host_os = cygwin* ]] && cpp_check "" "" "defined(__CYGWIN__)" ; then SYS="CYGWIN" define HAVE_MALLOC_H else SYS="WINDOWS" DEVNULL="NUL" LDFLAGSCLI="$LDFLAGSCLI -lshell32" [ $compiler = GNU ] && RC="${RC-${cross_prefix}windres}" || RC="${RC-rc}" fi ;; sunos*|solaris*) SYS="SunOS" define HAVE_MALLOC_H libm="-lm" if cc_check "" /usr/lib/64/values-xpg6.o; then LDFLAGS="$LDFLAGS /usr/lib/64/values-xpg6.o" else LDFLAGS="$LDFLAGS /usr/lib/values-xpg6.o" fi if test -x /usr/ucb/install ; then INSTALL=/usr/ucb/install elif test -x /usr/bin/ginstall ; then # OpenSolaris INSTALL=/usr/bin/ginstall elif test -x /usr/gnu/bin/install ; then # OpenSolaris INSTALL=/usr/gnu/bin/install fi HAVE_GETOPT_LONG=0 ;; *qnx*) SYS="QNX" define HAVE_MALLOC_H libm="-lm" HAVE_GETOPT_LONG=0 CFLAGS="$CFLAGS -I\$(SRCPATH)/extras" ;; *haiku*) SYS="HAIKU" ;; *) die "Unknown system $host, edit the configure" ;; esac LDFLAGS="$LDFLAGS $libm" stack_alignment=4 case $host_cpu in i*86) ARCH="X86" AS="${AS-yasm}" AS_EXT=".asm" CFLAGS="$CFLAGS -DARCH_X86_64=0" ASFLAGS="$ASFLAGS -DARCH_X86_64=0 -I\$(SRCPATH)/common/x86/" if [ $compiler = GNU ]; then if [[ "$asm" == auto && "$CFLAGS" != *-march* ]]; then CFLAGS="$CFLAGS -march=i686" fi if [[ "$asm" == auto && "$CFLAGS" != *-mfpmath* ]]; then CFLAGS="$CFLAGS -mfpmath=sse -msse -msse2" fi CFLAGS="-m32 $CFLAGS" LDFLAGS="-m32 $LDFLAGS" fi if [ "$SYS" = MACOSX ]; then ASFLAGS="$ASFLAGS -f macho32 -DPREFIX" elif [ "$SYS" = WINDOWS -o "$SYS" = CYGWIN ]; then ASFLAGS="$ASFLAGS -f win32 -DPREFIX" LDFLAGS="$LDFLAGS -Wl,--large-address-aware" [ $compiler = GNU ] && 
LDFLAGS="$LDFLAGS -Wl,--dynamicbase,--nxcompat,--tsaware" [ $compiler = GNU ] && RCFLAGS="--target=pe-i386 $RCFLAGS" else ASFLAGS="$ASFLAGS -f elf32" fi ;; x86_64) ARCH="X86_64" AS="${AS-yasm}" AS_EXT=".asm" CFLAGS="$CFLAGS -DARCH_X86_64=1" ASFLAGS="$ASFLAGS -DARCH_X86_64=1 -I\$(SRCPATH)/common/x86/" stack_alignment=16 [ $compiler = GNU ] && CFLAGS="-m64 $CFLAGS" && LDFLAGS="-m64 $LDFLAGS" if [ "$SYS" = MACOSX ]; then ASFLAGS="$ASFLAGS -f macho64 -DPIC -DPREFIX" if cc_check '' "-arch x86_64"; then CFLAGS="$CFLAGS -arch x86_64" LDFLAGS="$LDFLAGS -arch x86_64" fi elif [ "$SYS" = WINDOWS -o "$SYS" = CYGWIN ]; then ASFLAGS="$ASFLAGS -f win64" if [ $compiler = GNU ]; then # only the GNU toolchain is inconsistent in prefixing function names with _ cc_check "" "-S" && grep -q "_main:" conftest && ASFLAGS="$ASFLAGS -DPREFIX" cc_check "" "-Wl,--high-entropy-va" && LDFLAGS="$LDFLAGS -Wl,--high-entropy-va" LDFLAGS="$LDFLAGS -Wl,--dynamicbase,--nxcompat,--tsaware" LDFLAGSCLI="$LDFLAGSCLI -Wl,--image-base,0x140000000" SOFLAGS="$SOFLAGS -Wl,--image-base,0x180000000" RCFLAGS="--target=pe-x86-64 $RCFLAGS" fi else ASFLAGS="$ASFLAGS -f elf64" fi ;; powerpc*) ARCH="PPC" if [ $asm = auto ] ; then define HAVE_ALTIVEC AS="${AS-${CC}}" AS_EXT=".c" if [ $SYS = MACOSX ] ; then CFLAGS="$CFLAGS -faltivec -fastf -mcpu=G4" else CFLAGS="$CFLAGS -maltivec -mabi=altivec" define HAVE_ALTIVEC_H fi if [ "$vsx" != "no" ] ; then vsx="no" if cc_check "" "-mvsx" ; then CFLAGS="$CFLAGS -mvsx" define HAVE_VSX vsx="yes" fi fi fi ;; sparc) ARCH="SPARC" ;; mips*) ARCH="MIPS" AS="${AS-${CC}}" AS_EXT=".c" ;; arm*) ARCH="ARM" if [ "$SYS" = MACOSX ] ; then AS="${AS-${SRCPATH}/tools/gas-preprocessor.pl -arch arm -- ${CC}}" ASFLAGS="$ASFLAGS -DPREFIX -DPIC" # apple's ld doesn't support movw/movt relocations at all # build for armv7 by default if ! echo $CFLAGS | grep -Eq '\-arch' ; then CFLAGS="$CFLAGS -arch armv7" LDFLAGS="$LDFLAGS -arch armv7" fi else AS="${AS-${CC}}" fi ;; aarch64) ARCH="AARCH64" stack_alignment=16 if [ "$SYS" = MACOSX ] ; then AS="${AS-${SRCPATH}/tools/gas-preprocessor.pl -arch aarch64 -- ${CC}}" ASFLAGS="$ASFLAGS -DPREFIX" else AS="${AS-${CC}}" fi ;; s390|s390x) ARCH="S390" ;; hppa*|parisc*) ARCH="PARISC" ;; ia64) ARCH="IA64" ;; alpha*) ARCH="ALPHA" ;; *) ARCH="$(echo $host_cpu | tr a-z A-Z)" ;; esac [ "$vsx" != "yes" ] && vsx="no" if [ $SYS = WINDOWS ]; then if ! rc_check "0 RCDATA {0}" ; then RC="" fi if cpp_check "winapifamily.h" "" "!WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)" ; then [ $compiler = CL ] || die "WinRT requires MSVC" define HAVE_WINRT CFLAGS="$CFLAGS -MD" LDFLAGS="$LDFLAGS -appcontainer" if ! cpp_check "" "" "defined(_WIN32_WINNT) && _WIN32_WINNT >= 0x0603" ; then die "_WIN32_WINNT must be defined to at least 0x0603 (Windows 8.1) for WinRT" elif cpp_check "" "" "_WIN32_WINNT >= 0x0A00" ; then # Universal Windows Platform (Windows 10) LDFLAGS="$LDFLAGS -lWindowsApp" fi cli="no" opencl="no" fi fi log_msg "davs2 configure script" if [ -n "$*" ]; then msg="Command line options:" for i in $@; do msg="$msg \"$i\"" done log_msg "$msg" fi log_msg "" # check requirements cc_check || die "No working C compiler found." if [ $compiler_style = GNU ]; then if cc_check '' -std=gnu++11 'for( int i = 0; i < 9; i++ );' ; then CFLAGS="$CFLAGS -std=gnu++11 -D_GNU_SOURCE" elif ! cc_check '' '' 'for( int i = 0; i < 9; i++ );' ; then die "GNU++11 compiler is needed for compilation." 
fi fi if [ $shared = yes -a \( $ARCH = "X86_64" -o $ARCH = "PPC" -o $ARCH = "ALPHA" -o $ARCH = "ARM" -o $ARCH = "IA64" -o $ARCH = "PARISC" -o $ARCH = "MIPS" -o $ARCH = "AARCH64" \) ] ; then pic="yes" fi if [ $compiler = GNU -a \( $ARCH = X86 -o $ARCH = X86_64 \) ] ; then if cc_check '' -mpreferred-stack-boundary=5 ; then CFLAGS="$CFLAGS -mpreferred-stack-boundary=5" stack_alignment=32 elif [ $stack_alignment -lt 16 ] && cc_check '' -mpreferred-stack-boundary=4 ; then CFLAGS="$CFLAGS -mpreferred-stack-boundary=4" stack_alignment=16 fi elif [ $compiler = ICC -a $ARCH = X86 ]; then # icc on linux has various degrees of mod16 stack support if [ $SYS = LINUX ]; then # >= 12 defaults to a mod16 stack if cpp_check "" "" "__INTEL_COMPILER >= 1200" ; then stack_alignment=16 # 11 <= x < 12 is capable of keeping a mod16 stack, but defaults to not doing so. elif cpp_check "" "" "__INTEL_COMPILER >= 1100" ; then CFLAGS="$CFLAGS -falign-stack=assume-16-byte" stack_alignment=16 fi # < 11 is completely incapable of keeping a mod16 stack fi fi if [ $asm = auto -a \( $ARCH = X86 -o $ARCH = X86_64 \) ] ; then if ! as_check "vpmovzxwd ymm0, xmm0" ; then VER=`($AS --version || echo no assembler) 2>/dev/null | head -n 1` echo "Found $VER" echo "Minimum version is yasm-1.2.0" echo "If you really want to compile without asm, configure with --disable-asm." exit 1 fi cc_check '' '' '__asm__("pabsw %xmm0, %xmm0");' && define HAVE_X86_INLINE_ASM ASFLAGS="$ASFLAGS -Worphan-labels" define HAVE_MMX fi if [ $asm = auto -a $ARCH = ARM ] ; then # set flags so neon is built by default echo $CFLAGS | grep -Eq '(-mcpu|-march|-mfpu)' || CFLAGS="$CFLAGS -mcpu=cortex-a8 -mfpu=neon" if cc_check '' '' '__asm__("rev ip, ip");' ; then define HAVE_ARMV6 cc_check '' '' '__asm__("movt r0, #0");' && define HAVE_ARMV6T2 cc_check '' '' '__asm__("vadd.i16 q0, q0, q0");' && define HAVE_NEON ASFLAGS="$ASFLAGS -c" else echo "You specified a pre-ARMv6 or Thumb-1 CPU in your CFLAGS." echo "If you really want to run on such a CPU, configure with --disable-asm." exit 1 fi fi if [ $asm = auto -a $ARCH = AARCH64 ] ; then if cc_check '' '' '__asm__("cmeq v0.8h, v0.8h, #0");' ; then define HAVE_NEON ASFLAGS="$ASFLAGS -c" else echo "no NEON support, try adding -mfpu=neon to CFLAGS" echo "If you really want to run on such a CPU, configure with --disable-asm." exit 1 fi fi if [ $asm = auto -a \( $ARCH = ARM -o $ARCH = AARCH64 \) ] ; then # check if the assembler supports '.func' (clang 3.5 does not) as_check ".func test${NL}.endfunc" && define HAVE_AS_FUNC 1 fi if [ $asm = auto -a $ARCH = MIPS ] ; then if ! echo $CFLAGS | grep -Eq '(-march|-mmsa|-mno-msa)' ; then cc_check '' '-mmsa -mfp64 -mhard-float' && CFLAGS="-mmsa -mfp64 -mhard-float $CFLAGS" fi if cc_check '' '' '__asm__("addvi.b $w0, $w1, 1");' ; then define HAVE_MSA else echo "You specified a pre-MSA CPU in your CFLAGS." echo "If you really want to run on such a CPU, configure with --disable-asm." exit 1 fi fi [ $asm = no ] && AS="" [ "x$AS" = x ] && asm="no" || asm="yes" define ARCH_$ARCH define SYS_$SYS define STACK_ALIGNMENT $stack_alignment ASFLAGS="$ASFLAGS -DSTACK_ALIGNMENT=$stack_alignment" # skip endianness check for Intel Compiler and MSVS, as all supported platforms are little. 
each have flags that will cause the check to fail as well CPU_ENDIAN="little-endian" if [ $compiler = GNU ]; then echo "int i[2] = {0x42494745,0}; double f[2] = {0x1.0656e6469616ep+102,0};" > conftest.c $CC $CFLAGS conftest.c -c -o conftest.o 2>/dev/null || die "endian test failed" if (${cross_prefix}strings -a conftest.o | grep -q BIGE) && (${cross_prefix}strings -a conftest.o | grep -q FPendian) ; then define WORDS_BIGENDIAN CPU_ENDIAN="big-endian" elif !(${cross_prefix}strings -a conftest.o | grep -q EGIB && ${cross_prefix}strings -a conftest.o | grep -q naidnePF) ; then die "endian test failed" fi fi if [ "$cli_libdavs2" = "system" -a "$shared" != "yes" ] ; then [ "$static" = "yes" ] && die "Option --system-libdavs2 can not be used together with --enable-static" if $PKGCONFIG --exists davs2 2>/dev/null; then DAVS2_LIBS="$($PKGCONFIG --libs davs2)" DAVS2_INCLUDE_DIR="${DAVS2_INCLUDE_DIR-$($PKGCONFIG --variable=includedir davs2)}" configure_system_override "$DAVS2_INCLUDE_DIR" || die "Detection of system libdavs2 configuration failed" else die "Can not find system libdavs2" fi fi # autodetect options that weren't forced nor disabled libpthread="" if [ "$SYS" = "WINDOWS" -a "$thread" = "posix" ] ; then if [ "$gpl" = "no" ] ; then echo "Warning: pthread-win32 is LGPL and is therefore not supported with --disable-gpl" thread="no" elif cc_check pthread.h -lpthread "pthread_create(0,0,0,0);" ; then libpthread="-lpthread" elif cc_check pthread.h -lpthreadGC2 "pthread_create(0,0,0,0);" ; then libpthread="-lpthreadGC2" elif cc_check pthread.h "-lpthreadGC2 -lwsock32 -DPTW32_STATIC_LIB" "pthread_create(0,0,0,0);" ; then libpthread="-lpthreadGC2 -lwsock32" define PTW32_STATIC_LIB elif cc_check pthread.h "-lpthreadGC2 -lws2_32 -DPTW32_STATIC_LIB" "pthread_create(0,0,0,0);" ; then libpthread="-lpthreadGC2 -lws2_32" define PTW32_STATIC_LIB else thread="no" fi elif [ "$thread" != "no" ] ; then thread="no" case $SYS in BEOS) thread="beos" define HAVE_BEOSTHREAD ;; WINDOWS) thread="win32" define HAVE_WIN32THREAD ;; QNX) cc_check pthread.h -lc "pthread_create(0,0,0,0);" && thread="posix" && libpthread="-lc" ;; *) if cc_check pthread.h -lpthread "pthread_create(0,0,0,0);" ; then thread="posix" libpthread="-lpthread" else cc_check pthread.h "" "pthread_create(0,0,0,0);" && thread="posix" && libpthread="" fi ;; esac fi if [ "$thread" = "posix" ]; then LDFLAGS="$LDFLAGS $libpthread" define HAVE_POSIXTHREAD if [ "$SYS" = "LINUX" ] && cc_check sched.h "-D_GNU_SOURCE -Werror" "cpu_set_t p_aff; return CPU_COUNT(&p_aff);" ; then define HAVE_CPU_COUNT fi fi [ "$thread" != "no" ] && define HAVE_THREAD if cc_check "math.h" "-Werror" "return log2f(2);" ; then define HAVE_LOG2F fi if [ "$SYS" != "WINDOWS" ] && cpp_check "sys/mman.h unistd.h" "" "defined(MAP_PRIVATE)"; then define HAVE_MMAP fi if [ "$SYS" = "LINUX" -a \( "$ARCH" = "X86" -o "$ARCH" = "X86_64" \) ] && cc_check "sys/mman.h" "" "MADV_HUGEPAGE;" ; then define HAVE_THP fi cc_check "stdint.h" "" "uint32_t test_vec __attribute__ ((vector_size (16))) = {0,1,2,3};" && define HAVE_VECTOREXT if [ "$pic" = "yes" ] ; then [ "$SYS" != WINDOWS -a "$SYS" != CYGWIN ] && CFLAGS="$CFLAGS -fPIC" ASFLAGS="$ASFLAGS -DPIC" # resolve textrels in the x86 asm cc_check stdio.h "-shared -Wl,-Bsymbolic" && SOFLAGS="$SOFLAGS -Wl,-Bsymbolic" [ $SYS = SunOS -a "$ARCH" = "X86" ] && SOFLAGS="$SOFLAGS -mimpure-text" fi if [ "$debug" != "yes" -a "$gprof" != "yes" ]; then CFLAGS="$CFLAGS -fomit-frame-pointer" fi if [ "$strip" = "yes" ]; then LDFLAGS="$LDFLAGS -s" fi if [ "$debug" = "yes" 
]; then CFLAGS="-O1 -g $CFLAGS" RCFLAGS="$RCFLAGS -DDEBUG" else CFLAGS="-O3 -ffast-math $CFLAGS" if [ "$lto" = "auto" ] && [ $compiler = GNU ] && cc_check "" "-flto" ; then lto="yes" CFLAGS="$CFLAGS -flto" LDFLAGS="$LDFLAGS -O3 -flto" fi fi [ "$lto" = "auto" ] && lto="no" if cc_check '' -fno-tree-vectorize ; then CFLAGS="$CFLAGS -fno-tree-vectorize" fi if [ $SYS = WINDOWS -a $ARCH = X86 -a $compiler = GNU ] ; then # workaround g++/ld bug with alignment of static variables/arrays that are initialized to zero cc_check '' -fno-zero-initialized-in-bss && CFLAGS="$CFLAGS -fno-zero-initialized-in-bss" fi if cc_check '' -Wshadow ; then CFLAGS="-Wshadow $CFLAGS" fi if cc_check '' -Wmaybe-uninitialized ; then if [ $SYS = MACOSX ] ; then CFLAGS="-Wno-uninitialized $CFLAGS" else CFLAGS="-Wno-maybe-uninitialized $CFLAGS" fi fi if [ $compiler = ICC -o $compiler = ICL ] ; then if cc_check 'extras/intel_dispatcher.h' '' 'davs2_intel_dispatcher_override();' ; then define HAVE_INTEL_DISPATCHER fi fi if [ "$bit_depth" -gt "8" ]; then define HIGH_BIT_DEPTH ASFLAGS="$ASFLAGS -DHIGH_BIT_DEPTH=1" CFLAGS+=" -DHIGH_BIT_DEPTH=1" opencl="no" else ASFLAGS="$ASFLAGS -DHIGH_BIT_DEPTH=0" CFLAGS+=" -DHIGH_BIT_DEPTH=0" fi if [ "$chroma_format" != "all" ]; then define CHROMA_FORMAT CHROMA_$chroma_format fi ASFLAGS="$ASFLAGS -DBIT_DEPTH=$bit_depth" CFLAGS+=" -DBIT_DEPTH=$bit_depth" [ $gpl = yes ] && define HAVE_GPL && davs2_gpl=1 || davs2_gpl=0 [ $interlaced = yes ] && define HAVE_INTERLACED && davs2_interlaced=1 || davs2_interlaced=0 libdl="" if [ "$opencl" = "yes" ]; then opencl="no" # cygwin can use opencl if it can use LoadLibrary if [ $SYS = WINDOWS ] || ([ $SYS = CYGWIN ] && cc_check windows.h "" "LoadLibraryW(0);") ; then opencl="yes" define HAVE_OPENCL elif [ "$SYS" = "LINUX" -o "$SYS" = "MACOSX" ] ; then opencl="yes" define HAVE_OPENCL libdl="-ldl" fi LDFLAGS="$LDFLAGS $libdl" fi #define undefined vars as 0 for var in $CONFIG_HAVE; do grep -q "HAVE_$var 1" config.h || define HAVE_$var 0 done # generate exported config file config_chroma_format="DAVS2_CSP_I$chroma_format" [ "$config_chroma_format" == "DAVS2_CSP_Iall" ] && config_chroma_format="0" cat > davs2_config.h << EOF #define DAVS2_BIT_DEPTH $bit_depth #define DAVS2_GPL $davs2_gpl #define DAVS2_INTERLACED $davs2_interlaced #define DAVS2_CHROMA_FORMAT $config_chroma_format EOF # generate version.h cd ${SRCPATH}/.. ./version.sh >> ${BUILDPATH}/davs2_config.h cd ${BUILDPATH} if [ "$cli_libdavs2" = "system" ] ; then if [ "$shared" = "yes" ]; then CLI_LIBDAVS2='$(SONAME)' else CLI_LIBDAVS2= LDFLAGSCLI="$DAVS2_LIBS $LDFLAGSCLI" cc_check 'stdint.h davs2.h' '' 'davs2_encoder_open(0);' || die "System libdavs2 can't be used for compilation of this version" fi else CLI_LIBDAVS2='$(LIBDAVS2)' fi DEPMM="${QPRE}MM" DEPMT="${QPRE}MT" if [ $compiler_style = MS ]; then AR="lib -nologo -out:" LD="link -out:" if [ $compiler = ICL ]; then AR="xi$AR" LD="xi$LD" else mslink="$(dirname "$(command -v cl 2>/dev/null)")/link" [ -x "$mslink" ] && LD="\"$mslink\" -out:" fi HAVE_GETOPT_LONG=0 LDFLAGS="-nologo -incremental:no $(cl_ldflags $LDFLAGS)" LDFLAGSCLI="$(cl_ldflags $LDFLAGSCLI)" LIBDAVS2=libdavs2.lib RANLIB= [ -n "$RC" ] && RCFLAGS="$RCFLAGS -nologo -I. -I\$(SRCPATH)/extras -fo" STRIP= if [ $debug = yes ]; then LDFLAGS="-debug $LDFLAGS" CFLAGS="-D_DEBUG $CFLAGS" else CFLAGS="-DNDEBUG $CFLAGS" fi else # g++/icc DEPMM="$DEPMM -g0" AR="$AR rc " LD="$CC -o " LIBDAVS2=libdavs2.a [ -n "$RC" ] && RCFLAGS="$RCFLAGS -I. 
-o " fi [ $compiler != GNU ] && CFLAGS="$(cc_cflags $CFLAGS)" if [ $compiler = ICC -o $compiler = ICL ]; then # icc does not define __SSE__ until SSE2 optimization and icl never defines it or _M_IX86_FP [ \( $ARCH = X86_64 -o $ARCH = X86 \) -a $asm = yes ] && ! cpp_check "" "" "defined(__SSE__)" && define __SSE__ PROF_GEN_CC="${QPRE}prof-gen ${QPRE}prof-dir." PROF_GEN_LD= PROF_USE_CC="${QPRE}prof-use ${QPRE}prof-dir." PROF_USE_LD= elif [ $compiler = CL ]; then # Visual Studio # _M_IX86_FP is only defined on x86 [ $ARCH = X86 ] && cpp_check '' '' '_M_IX86_FP >= 1' && define __SSE__ [ $ARCH = X86_64 ] && define __SSE__ # As long as the cli application can't link against the dll, the dll can not be pgo'd. # pgds are link flag specific and the -dll flag for creating the dll makes it unshareable with the cli PROF_GEN_CC="-GL" PROF_GEN_LD="-LTCG:PGINSTRUMENT" PROF_USE_CC="-GL" PROF_USE_LD="-LTCG:PGOPTIMIZE" else PROF_GEN_CC="-fprofile-generate" PROF_GEN_LD="-fprofile-generate" PROF_USE_CC="-fprofile-use" PROF_USE_LD="-fprofile-use" fi # generate config files cat > config.mak << EOF SRCPATH=$SRCPATH prefix=$prefix exec_prefix=$exec_prefix bindir=$bindir libdir=$libdir includedir=$includedir SYS_ARCH=$ARCH SYS=$SYS CC=$CC CFLAGS=$CFLAGS COMPILER=$compiler COMPILER_STYLE=$compiler_style DEPMM=$DEPMM DEPMT=$DEPMT LD=$LD LDFLAGS=$LDFLAGS LIBDAVS2=$LIBDAVS2 AR=$AR RANLIB=$RANLIB STRIP=$STRIP INSTALL=$INSTALL AS=$AS ASFLAGS=$ASFLAGS RC=$RC RCFLAGS=$RCFLAGS EXE=$EXE HAVE_GETOPT_LONG=$HAVE_GETOPT_LONG DEVNULL=$DEVNULL PROF_GEN_CC=$PROF_GEN_CC PROF_GEN_LD=$PROF_GEN_LD PROF_USE_CC=$PROF_USE_CC PROF_USE_LD=$PROF_USE_LD HAVE_OPENCL=$opencl EOF if [ $compiler_style = MS ]; then echo '%.o: %.c' >> config.mak echo ' $(CC) $(CFLAGS) -c -Fo$@ $<' >> config.mak fi if [ "$cli" = "yes" ]; then echo 'default: cli' >> config.mak echo 'install: install-cli' >> config.mak fi if [ "$shared" = "yes" ]; then API=$(grep '#define DAVS2_BUILD' < ${BUILDPATH}/davs2_config.h | sed 's/^.* \([1-9][0-9]*\).*$/\1/') if [ "$SYS" = "WINDOWS" -o "$SYS" = "CYGWIN" ]; then echo "SONAME=libdavs2-$API.dll" >> config.mak if [ $compiler_style = MS ]; then echo 'IMPLIBNAME=libdavs2.dll.lib' >> config.mak # GNU ld on windows defaults to exporting all global functions if there are no explicit __declspec(dllexport) declarations # MSVC link does not act similarly, so it is required to make an export definition out of davs2.h and use it at link time echo "SOFLAGS=-dll -def:davs2.def -implib:\$(IMPLIBNAME) $SOFLAGS" >> config.mak echo "EXPORTS" > davs2.def # export API functions grep "^\(int\|void\|davs2_t\).*davs2" ${SRCPATH}/davs2.h | sed -e "s/.*\(davs2.*\)(.*/\1/;s/open/open_$API/g" >> davs2.def # export API variables/data. 
must be flagged with the DATA keyword grep "extern.*davs2" ${SRCPATH}/davs2.h | sed -e "s/.*\(davs2\w*\)\W.*/\1 DATA/;" >> davs2.def else echo 'IMPLIBNAME=libdavs2.dll.a' >> config.mak echo "SOFLAGS=-shared -Wl,--out-implib,\$(IMPLIBNAME) $SOFLAGS" >> config.mak fi elif [ "$SYS" = "MACOSX" ]; then echo "SOSUFFIX=dylib" >> config.mak echo "SONAME=libdavs2.$API.dylib" >> config.mak echo "SOFLAGS=-shared -dynamiclib -Wl,-single_module -Wl,-read_only_relocs,suppress -install_name \$(DESTDIR)\$(libdir)/\$(SONAME) $SOFLAGS" >> config.mak elif [ "$SYS" = "SunOS" ]; then echo "SOSUFFIX=so" >> config.mak echo "SONAME=libdavs2.so.$API" >> config.mak echo "SOFLAGS=-shared -Wl,-h,\$(SONAME) $SOFLAGS" >> config.mak else echo "SOSUFFIX=so" >> config.mak echo "SONAME=libdavs2.so.$API" >> config.mak echo "SOFLAGS=-shared -Wl,-soname,\$(SONAME) $SOFLAGS" >> config.mak fi echo 'default: lib-shared' >> config.mak echo 'install: install-lib-shared' >> config.mak fi if [ "$static" = "yes" ]; then echo 'default: lib-static' >> config.mak echo 'install: install-lib-static' >> config.mak fi echo "LDFLAGSCLI = $LDFLAGSCLI" >> config.mak echo "CLI_LIBDAVS2 = $CLI_LIBDAVS2" >> config.mak cat > davs2.pc << EOF prefix=$prefix exec_prefix=$exec_prefix libdir=$libdir includedir=$includedir Name: davs2 Description: AVS2 (IEEE 1857.4) decoder library Version: $(grep POINTVER < davs2_config.h | sed -e 's/.* "//; s/".*//') Libs: -L$libdir -ldavs2 $([ "$shared" = "yes" ] || echo $libpthread $libm $libdl) Libs.private: $([ "$shared" = "yes" ] && echo $libpthread $libm $libdl) Cflags: -I$includedir EOF filters="crop select_every" gpl_filters="" [ $gpl = yes ] && filters="$filters $gpl_filters" cat > conftest.log <> config.log cat conftest.log >> config.log cat conftest.log # [ "$SRCPATH" != "." ] && ln -sf ${SRCPATH}/Makefile ./Makefile mkdir -p common/{aarch64,arm,ppc,x86,vec} test echo echo "You can run 'make' or 'make fprofiled' now." 
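# A typical invocation of the script above, for reference (the option values
# are illustrative only; see the option parser for the full list):
#   cd build/linux
#   ./configure --enable-shared --bit-depth=8 --chroma-format=all
#   make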
davs2-1.6/build/vs2013/000077500000000000000000000000001337322544400144435ustar00rootroot00000000000000davs2-1.6/build/vs2013/DAVS2.sln000066400000000000000000000113651337322544400160060ustar00rootroot00000000000000 Microsoft Visual Studio Solution File, Format Version 12.00 # Visual Studio 2013 VisualStudioVersion = 12.0.40629.0 MinimumVisualStudioVersion = 10.0.40219.1 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "davs2", "davs2.vcxproj", "{852EFB9B-4E73-4E80-AA57-711ADCB132AE}" ProjectSection(ProjectDependencies) = postProject {34C0840A-BDE6-446B-B0DF-A8281A42825B} = {34C0840A-BDE6-446B-B0DF-A8281A42825B} EndProjectSection EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "libdavs2", "libdavs2.vcxproj", "{34C0840A-BDE6-446B-B0DF-A8281A42825B}" ProjectSection(ProjectDependencies) = postProject {A9B37E3C-A8C7-4E24-BC2D-AB4D0804DAC1} = {A9B37E3C-A8C7-4E24-BC2D-AB4D0804DAC1} {558555B9-A7B2-42D6-A298-BB5CC248541F} = {558555B9-A7B2-42D6-A298-BB5CC248541F} {2E7A6EE4-927F-470A-A012-3B29EDB87906} = {2E7A6EE4-927F-470A-A012-3B29EDB87906} EndProjectSection EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "libdavs2_asm", "libdavs2_asm.vcxproj", "{A9B37E3C-A8C7-4E24-BC2D-AB4D0804DAC1}" EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "libdavs2_intrin_avx", "libdavs2_intrin_avx.vcxproj", "{558555B9-A7B2-42D6-A298-BB5CC248541F}" EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "libdavs2_intrin_sse", "libdavs2_intrin_sse.vcxproj", "{2E7A6EE4-927F-470A-A012-3B29EDB87906}" EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Win32 = Debug|Win32 Debug|x64 = Debug|x64 Release|Win32 = Release|Win32 Release|x64 = Release|x64 EndGlobalSection GlobalSection(ProjectConfigurationPlatforms) = postSolution {852EFB9B-4E73-4E80-AA57-711ADCB132AE}.Debug|Win32.ActiveCfg = Debug|Win32 {852EFB9B-4E73-4E80-AA57-711ADCB132AE}.Debug|Win32.Build.0 = Debug|Win32 {852EFB9B-4E73-4E80-AA57-711ADCB132AE}.Debug|x64.ActiveCfg = Debug|x64 {852EFB9B-4E73-4E80-AA57-711ADCB132AE}.Debug|x64.Build.0 = Debug|x64 {852EFB9B-4E73-4E80-AA57-711ADCB132AE}.Release|Win32.ActiveCfg = Release|Win32 {852EFB9B-4E73-4E80-AA57-711ADCB132AE}.Release|Win32.Build.0 = Release|Win32 {852EFB9B-4E73-4E80-AA57-711ADCB132AE}.Release|x64.ActiveCfg = Release|x64 {852EFB9B-4E73-4E80-AA57-711ADCB132AE}.Release|x64.Build.0 = Release|x64 {34C0840A-BDE6-446B-B0DF-A8281A42825B}.Debug|Win32.ActiveCfg = Debug|Win32 {34C0840A-BDE6-446B-B0DF-A8281A42825B}.Debug|Win32.Build.0 = Debug|Win32 {34C0840A-BDE6-446B-B0DF-A8281A42825B}.Debug|x64.ActiveCfg = Debug|x64 {34C0840A-BDE6-446B-B0DF-A8281A42825B}.Debug|x64.Build.0 = Debug|x64 {34C0840A-BDE6-446B-B0DF-A8281A42825B}.Release|Win32.ActiveCfg = Release|Win32 {34C0840A-BDE6-446B-B0DF-A8281A42825B}.Release|Win32.Build.0 = Release|Win32 {34C0840A-BDE6-446B-B0DF-A8281A42825B}.Release|x64.ActiveCfg = Release|x64 {34C0840A-BDE6-446B-B0DF-A8281A42825B}.Release|x64.Build.0 = Release|x64 {A9B37E3C-A8C7-4E24-BC2D-AB4D0804DAC1}.Debug|Win32.ActiveCfg = Debug|Win32 {A9B37E3C-A8C7-4E24-BC2D-AB4D0804DAC1}.Debug|Win32.Build.0 = Debug|Win32 {A9B37E3C-A8C7-4E24-BC2D-AB4D0804DAC1}.Debug|x64.ActiveCfg = Debug|x64 {A9B37E3C-A8C7-4E24-BC2D-AB4D0804DAC1}.Debug|x64.Build.0 = Debug|x64 {A9B37E3C-A8C7-4E24-BC2D-AB4D0804DAC1}.Release|Win32.ActiveCfg = Release|Win32 {A9B37E3C-A8C7-4E24-BC2D-AB4D0804DAC1}.Release|Win32.Build.0 = Release|Win32 {A9B37E3C-A8C7-4E24-BC2D-AB4D0804DAC1}.Release|x64.ActiveCfg = Release|x64 
{A9B37E3C-A8C7-4E24-BC2D-AB4D0804DAC1}.Release|x64.Build.0 = Release|x64 {558555B9-A7B2-42D6-A298-BB5CC248541F}.Debug|Win32.ActiveCfg = Debug|Win32 {558555B9-A7B2-42D6-A298-BB5CC248541F}.Debug|Win32.Build.0 = Debug|Win32 {558555B9-A7B2-42D6-A298-BB5CC248541F}.Debug|x64.ActiveCfg = Debug|x64 {558555B9-A7B2-42D6-A298-BB5CC248541F}.Debug|x64.Build.0 = Debug|x64 {558555B9-A7B2-42D6-A298-BB5CC248541F}.Release|Win32.ActiveCfg = Release|Win32 {558555B9-A7B2-42D6-A298-BB5CC248541F}.Release|Win32.Build.0 = Release|Win32 {558555B9-A7B2-42D6-A298-BB5CC248541F}.Release|x64.ActiveCfg = Release|x64 {558555B9-A7B2-42D6-A298-BB5CC248541F}.Release|x64.Build.0 = Release|x64 {2E7A6EE4-927F-470A-A012-3B29EDB87906}.Debug|Win32.ActiveCfg = Debug|Win32 {2E7A6EE4-927F-470A-A012-3B29EDB87906}.Debug|Win32.Build.0 = Debug|Win32 {2E7A6EE4-927F-470A-A012-3B29EDB87906}.Debug|x64.ActiveCfg = Debug|x64 {2E7A6EE4-927F-470A-A012-3B29EDB87906}.Debug|x64.Build.0 = Debug|x64 {2E7A6EE4-927F-470A-A012-3B29EDB87906}.Release|Win32.ActiveCfg = Release|Win32 {2E7A6EE4-927F-470A-A012-3B29EDB87906}.Release|Win32.Build.0 = Release|Win32 {2E7A6EE4-927F-470A-A012-3B29EDB87906}.Release|x64.ActiveCfg = Release|x64 {2E7A6EE4-927F-470A-A012-3B29EDB87906}.Release|x64.Build.0 = Release|x64 EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE EndGlobalSection EndGlobal davs2-1.6/build/vs2013/davs2.vcxproj000066400000000000000000000252371337322544400171100ustar00rootroot00000000000000 Debug Win32 Debug x64 Release Win32 Release x64 {852EFB9B-4E73-4E80-AA57-711ADCB132AE} Win32Proj Application true v120 MultiByte Application true v120 MultiByte Application false v120 true MultiByte Application false v120 true MultiByte true $(Platform)\$(Configuration)\$(ProjectName)\ $(SolutionDir)$(Platform)\ true $(Platform)\$(Configuration)\$(ProjectName)\ $(SolutionDir)$(Platform)\ false $(Platform)\$(Configuration)\$(ProjectName)\ $(SolutionDir)$(Platform)\ false $(Platform)\$(Configuration)\$(ProjectName)\ $(SolutionDir)$(Platform)\ Level4 Disabled HIGH_BIT_DEPTH=0;WIN32;ARCH_X86_64=0;_DEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions) ..\..\;..\..\source;..\..\pthread;..\..\source\test\getopt; MultiThreadedDebug Console true kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) $(SolutionDir)$(Platform)\ true Level4 Disabled HIGH_BIT_DEPTH=0;WIN32;ARCH_X86_64=1;_DEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions) ..\..\;..\..\source;..\..\pthread;..\..\source\test\getopt; MultiThreadedDebug Console true kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) $(SolutionDir)$(Platform)\ Level4 MaxSpeed true true HIGH_BIT_DEPTH=0;WIN32;ARCH_X86_64=0;NDEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions) ..\..\;..\..\source;..\..\pthread;..\..\source\test\getopt; Speed MultiThreaded Console true true true kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) $(SolutionDir)$(Platform)\ true Level4 MaxSpeed true true HIGH_BIT_DEPTH=0;WIN32;ARCH_X86_64=1;NDEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions) ..\..\;..\..\source;..\..\pthread;..\..\source\test\getopt; Speed MultiThreaded Console true true true 
kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) $(SolutionDir)$(Platform)\ AsInvoker {8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942} false davs2-1.6/build/vs2013/davs2.vcxproj.filters000066400000000000000000000027731337322544400205570ustar00rootroot00000000000000 {93995380-89BD-4b04-88EB-625FBE52EBFB} h;hh;hpp;hxx;hm;inl;inc;xsd {4FC737F1-C7A5-4376-A066-2A32D752A2FF} cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx inc inc inc inc inc inc inc src src davs2-1.6/build/vs2013/libdavs2.vcxproj000066400000000000000000000340471337322544400175760ustar00rootroot00000000000000 Debug Win32 Debug x64 Release Win32 Release x64 {34C0840A-BDE6-446B-B0DF-A8281A42825B} Win32Proj DynamicLibrary true v120 MultiByte DynamicLibrary true v120 MultiByte DynamicLibrary false v120 true MultiByte DynamicLibrary false v120 true MultiByte true $(SolutionDir)$(Platform)\ $(Platform)\$(Configuration)\$(ProjectName)\ $(Platform)\$(Configuration)\$(ProjectName)\ true $(SolutionDir)$(Platform)\ false $(SolutionDir)$(Platform)\ $(Platform)\$(Configuration)\$(ProjectName)\ $(Platform)\$(Configuration)\$(ProjectName)\ false $(SolutionDir)$(Platform)\ Level4 Disabled DAVS2_EXPORTS;HIGH_BIT_DEPTH=0;BIT_DEPTH=8;WIN32;ARCH_X86_64=0;_DEBUG;_WINDOWS;_USRDLL;%(PreprocessorDefinitions) ..\..\;..\..\source;..\..\pthread MultiThreadedDebug Windows true $(SolutionDir)$(Platform)\ libdavs2_asm.lib;libdavs2_intrin_sse.lib;libdavs2_intrin_avx.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) true cd /d "$(SolutionDir)..\.." && sh version.sh UpdateSourceVersionInfo Level4 Disabled DAVS2_EXPORTS;HIGH_BIT_DEPTH=0;BIT_DEPTH=8;WIN32;ARCH_X86_64=1;_DEBUG;_WINDOWS;_USRDLL;%(PreprocessorDefinitions) ..\..\;..\..\source;..\..\pthread;$(CUDA_PATH)\include;$(AMD_APPSDK_PATH)\include;$(INTEL_OPENCL_SDK)\include; MultiThreadedDebug Windows true $(SolutionDir)$(Platform)\;$(CUDA_PATH)\lib\$(Platform);$(INTEL_OPENCL_SDK)\lib\$(Platform);$(AMD_APPSDK_PATH)\lib\x64;%(AdditionalLibraryDirectories); libdavs2_asm.lib;libdavs2_intrin_sse.lib;libdavs2_intrin_avx.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) cd /d "$(SolutionDir)..\.." && sh version.sh UpdateSourceVersionInfo Level4 MaxSpeed true true DAVS2_EXPORTS;HIGH_BIT_DEPTH=0;BIT_DEPTH=8;WIN32;ARCH_X86_64=0;NDEBUG;_WINDOWS;_USRDLL;%(PreprocessorDefinitions) ..\..\;..\..\source;..\..\pthread Speed MultiThreaded Windows true true true $(SolutionDir)$(Platform)\ libdavs2_asm.lib;libdavs2_intrin_sse.lib;libdavs2_intrin_avx.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) true cd /d "$(SolutionDir)..\.." 
&& sh version.sh UpdateSourceVersionInfo Level4 MaxSpeed true true DAVS2_EXPORTS;HIGH_BIT_DEPTH=0;BIT_DEPTH=8;WIN32;ARCH_X86_64=1;NDEBUG;_WINDOWS;_USRDLL;%(PreprocessorDefinitions) ..\..\;..\..\source;..\..\pthread;$(CUDA_PATH)\include;$(AMD_APPSDK_PATH)\include;$(INTEL_OPENCL_SDK)\include; Speed MultiThreaded Windows true true true $(SolutionDir)$(Platform)\;$(CUDA_PATH)\lib\$(Platform);$(INTEL_OPENCL_SDK)\lib\$(Platform);$(AMD_APPSDK_PATH)\lib\x64;%(AdditionalLibraryDirectories); libdavs2_asm.lib;libdavs2_intrin_sse.lib;libdavs2_intrin_avx.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) cd /d "$(SolutionDir)..\.." && sh version.sh UpdateSourceVersionInfo davs2-1.6/build/vs2013/libdavs2.vcxproj.filters000066400000000000000000000137121337322544400212410ustar00rootroot00000000000000 {93995380-89BD-4b04-88EB-625FBE52EBFB} h;hh;hpp;hxx;hm;inl;inc;xsd {4FC737F1-C7A5-4376-A066-2A32D752A2FF} cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms src src src src src src src src src src src src src src src src src src src src src src src inc inc inc inc inc inc inc inc inc inc inc inc inc inc inc inc inc inc inc inc inc inc inc inc inc inc inc inc davs2-1.6/build/vs2013/libdavs2_asm.vcxproj000066400000000000000000000244401337322544400204320ustar00rootroot00000000000000 Debug Win32 Debug x64 Release Win32 Release x64 true true true true {A9B37E3C-A8C7-4E24-BC2D-AB4D0804DAC1} Win32Proj asmopt StaticLibrary true v120 MultiByte StaticLibrary true v120 MultiByte StaticLibrary false v120 true MultiByte StaticLibrary false v120 true MultiByte $(SolutionDir)$(Platform)\ $(Platform)\$(Configuration)\$(ProjectName)\ $(SolutionDir)$(Platform)\ $(Platform)\$(Configuration)\$(ProjectName)\ $(SolutionDir)$(Platform)\ $(Platform)\$(Configuration)\$(ProjectName)\ $(SolutionDir)$(Platform)\ $(Platform)\$(Configuration)\$(ProjectName)\ Level4 Disabled HIGH_BIT_DEPTH=0;BIT_DEPTH=8;WIN32;ARCH_X86_64=0;_DEBUG;_LIB;%(PreprocessorDefinitions) ..\..\;..\..\source;..\..\pthread ProgramDatabase MultiThreadedDebug 4752; Windows true HIGH_BIT_DEPTH=0;BIT_DEPTH=8;ARCH_X86_64=0;PREFIX Level4 Disabled HIGH_BIT_DEPTH=0;BIT_DEPTH=8;WIN32;ARCH_X86_64=1;_DEBUG;_LIB;%(PreprocessorDefinitions) ..\..\;..\..\source;..\..\pthread;$(CUDA_PATH)\include;$(AMD_APPSDK_PATH)\include;$(INTEL_OPENCL_SDK)\include; MultiThreadedDebug 4752; Windows true HIGH_BIT_DEPTH=0;BIT_DEPTH=8;ARCH_X86_64=1; Level4 MaxSpeed true true HIGH_BIT_DEPTH=0;BIT_DEPTH=8;WIN32;ARCH_X86_64=0;NDEBUG;_LIB;%(PreprocessorDefinitions) ..\..\;..\..\source;..\..\pthread Speed MultiThreaded 4752; Windows true true true HIGH_BIT_DEPTH=0;BIT_DEPTH=8;ARCH_X86_64=0;PREFIX Level4 MaxSpeed true true HIGH_BIT_DEPTH=0;BIT_DEPTH=8;WIN32;ARCH_X86_64=1;NDEBUG;_LIB;%(PreprocessorDefinitions) ..\..\;..\..\source;..\..\pthread;$(CUDA_PATH)\include;$(AMD_APPSDK_PATH)\include;$(INTEL_OPENCL_SDK)\include; Speed MultiThreaded 4752; Windows true true true HIGH_BIT_DEPTH=0;BIT_DEPTH=8;ARCH_X86_64=1; davs2-1.6/build/vs2013/libdavs2_asm.vcxproj.filters000066400000000000000000000027141337322544400221010ustar00rootroot00000000000000 {4FC737F1-C7A5-4376-A066-2A32D752A2FF} cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx asm-x86 asm-x86 asm-x86 asm-x86 asm-x86 asm-x86 asm-x86 asm-x86 asm-x86 asm-x86 
davs2-1.6/build/vs2013/libdavs2_intrin_avx.vcxproj000066400000000000000000000242371337322544400220370ustar00rootroot00000000000000 Debug Win32 Debug x64 Release Win32 Release x64 {558555B9-A7B2-42D6-A298-BB5CC248541F} Win32Proj asmopt StaticLibrary true v120 MultiByte StaticLibrary true v120 MultiByte StaticLibrary false v120 true MultiByte StaticLibrary false v120 true MultiByte $(SolutionDir)$(Platform)\ $(Platform)\$(Configuration)\$(ProjectName)\ $(SolutionDir)$(Platform)\ $(Platform)\$(Configuration)\$(ProjectName)\ $(SolutionDir)$(Platform)\ $(Platform)\$(Configuration)\$(ProjectName)\ $(SolutionDir)$(Platform)\ $(Platform)\$(Configuration)\$(ProjectName)\ Level4 Disabled HIGH_BIT_DEPTH=0;BIT_DEPTH=8;WIN32;ARCH_X86_64=0;_DEBUG;_LIB;%(PreprocessorDefinitions) ..\..\;..\..\source;..\..\pthread ProgramDatabase MultiThreadedDebug AdvancedVectorExtensions2 Windows true HIGH_BIT_DEPTH=0;BIT_DEPTH=8;ARCH_X86_64=0;PREFIX Level4 Disabled HIGH_BIT_DEPTH=0;BIT_DEPTH=8;WIN32;ARCH_X86_64=1;_DEBUG;_LIB;%(PreprocessorDefinitions) ..\..\;..\..\source;..\..\pthread;$(CUDA_PATH)\include;$(AMD_APPSDK_PATH)\include;$(INTEL_OPENCL_SDK)\include; MultiThreadedDebug AdvancedVectorExtensions2 Windows true HIGH_BIT_DEPTH=0;BIT_DEPTH=8;ARCH_X86_64=1; Level4 MaxSpeed true true HIGH_BIT_DEPTH=0;BIT_DEPTH=8;WIN32;ARCH_X86_64=0;NDEBUG;_LIB;%(PreprocessorDefinitions) ..\..\;..\..\source;..\..\pthread Speed MultiThreaded AdvancedVectorExtensions2 Windows true true true HIGH_BIT_DEPTH=0;BIT_DEPTH=8;ARCH_X86_64=0;PREFIX Level4 MaxSpeed true true HIGH_BIT_DEPTH=0;BIT_DEPTH=8;WIN32;ARCH_X86_64=1;NDEBUG;_LIB;%(PreprocessorDefinitions) ..\..\;..\..\source;..\..\pthread;$(CUDA_PATH)\include;$(AMD_APPSDK_PATH)\include;$(INTEL_OPENCL_SDK)\include; Speed MultiThreaded AdvancedVectorExtensions Windows true true true HIGH_BIT_DEPTH=0;BIT_DEPTH=8;ARCH_X86_64=1; davs2-1.6/build/vs2013/libdavs2_intrin_avx.vcxproj.filters000066400000000000000000000023771337322544400235070ustar00rootroot00000000000000 {93995380-89BD-4b04-88EB-625FBE52EBFB} cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx vec vec vec vec vec vec vec davs2-1.6/build/vs2013/libdavs2_intrin_sse.vcxproj000066400000000000000000000240441337322544400220270ustar00rootroot00000000000000 Debug Win32 Debug x64 Release Win32 Release x64 {2E7A6EE4-927F-470A-A012-3B29EDB87906} Win32Proj asmopt StaticLibrary true v120 MultiByte StaticLibrary true v120 MultiByte StaticLibrary false v120 true MultiByte StaticLibrary false v120 true MultiByte $(SolutionDir)$(Platform)\ $(Platform)\$(Configuration)\$(ProjectName)\ $(SolutionDir)$(Platform)\ $(Platform)\$(Configuration)\$(ProjectName)\ $(SolutionDir)$(Platform)\ $(Platform)\$(Configuration)\$(ProjectName)\ $(SolutionDir)$(Platform)\ $(Platform)\$(Configuration)\$(ProjectName)\ Level4 Disabled HIGH_BIT_DEPTH=0;BIT_DEPTH=8;WIN32;ARCH_X86_64=0;_DEBUG;_LIB;%(PreprocessorDefinitions) ..\..\;..\..\source;..\..\pthread ProgramDatabase MultiThreadedDebug 4752; Windows true HIGH_BIT_DEPTH=0;BIT_DEPTH=8;ARCH_X86_64=0;PREFIX Level4 Disabled HIGH_BIT_DEPTH=0;BIT_DEPTH=8;WIN32;ARCH_X86_64=1;_DEBUG;_LIB;%(PreprocessorDefinitions) ..\..\;..\..\source;..\..\pthread;$(CUDA_PATH)\include;$(AMD_APPSDK_PATH)\include;$(INTEL_OPENCL_SDK)\include; MultiThreadedDebug 4752; Windows true HIGH_BIT_DEPTH=0;BIT_DEPTH=8;ARCH_X86_64=1; Level4 MaxSpeed true true HIGH_BIT_DEPTH=0;BIT_DEPTH=8;WIN32;ARCH_X86_64=0;NDEBUG;_LIB;%(PreprocessorDefinitions) ..\..\;..\..\source;..\..\pthread Speed MultiThreaded 4752; Windows true true true 
HIGH_BIT_DEPTH=0;BIT_DEPTH=8;ARCH_X86_64=0;PREFIX Level4 MaxSpeed true true HIGH_BIT_DEPTH=0;BIT_DEPTH=8;WIN32;ARCH_X86_64=1;NDEBUG;_LIB;%(PreprocessorDefinitions) ..\..\;..\..\source;..\..\pthread;$(CUDA_PATH)\include;$(AMD_APPSDK_PATH)\include;$(INTEL_OPENCL_SDK)\include; Speed MultiThreaded 4752; Windows true true true HIGH_BIT_DEPTH=0;BIT_DEPTH=8;ARCH_X86_64=1; davs2-1.6/build/vs2013/libdavs2_intrin_sse.vcxproj.filters000066400000000000000000000032671337322544400235020ustar00rootroot00000000000000 {93995380-89BD-4b04-88EB-625FBE52EBFB} cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx vec vec vec vec vec vec vec vec vec vec vec davs2-1.6/source/000077500000000000000000000000001337322544400137065ustar00rootroot00000000000000davs2-1.6/source/common/000077500000000000000000000000001337322544400151765ustar00rootroot00000000000000davs2-1.6/source/common/aec.cc000066400000000000000000002301621337322544400162410ustar00rootroot00000000000000/* * aec.cc * * Description of this file: * AEC functions definition of the davs2 library * * -------------------------------------------------------------------------- * * davs2 - video decoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn.
*/ #include "common.h" #include "block_info.h" #include "alf.h" #include "aec.h" #include "vlc.h" #include "sao.h" #include "scantab.h" /** * =========================================================================== * macros * =========================================================================== */ #define CTRL_OPT_AEC 1 /* ǷûڲAEC״̬ */ #define MAKE_CONTEXT(lg_pmps, mps, cycno) (((uint16_t)(cycno) << 0) | ((uint16_t)(mps) << 2) | (uint16_t)(lg_pmps << 3)) /** * =========================================================================== * global & local variables * =========================================================================== */ #if AVS2_TRACE int symbolCount = 0; #endif #if CTRL_OPT_AEC /* [8 * lg_pmps + 4 * mps + cycno] */ static context_t g_tab_ctx_mps[2048 * 4 * 2]; static context_t g_tab_ctx_lps[2048 * 4 * 2]; #endif /* --------------------------------------------------------------------------- * 0: INTRA_PRED_VER * 1: INTRA_PRED_HOR * 2: INTRA_PRED_DC_DIAG */ const int tab_intra_mode_scan_type[NUM_INTRA_MODE] = { 2, 2, 2, 1, 1, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 0 }; static const int EO_OFFSET_INV__MAP[] = { 1, 0, 2, -1, 3, 4, 5, 6 }; static const int T_Chr[5] = { 0, 1, 2, 4, 3000 }; static const int8_t tab_rank[6] = { 0, 1, 2, 3, 3, 4/*, 4 ...*/ }; static const uint8_t raster2ZZ_4x4[] = { 0, 1, 5, 6, 2, 4, 7, 12, 3, 8, 11, 13, 9, 10, 14, 15 }; static const uint8_t raster2ZZ_8x8[] = { 0, 1, 5, 6, 14, 15, 27, 28, 2, 4, 7, 13, 16, 26, 29, 42, 3, 8, 12, 17, 25, 30, 41, 43, 9, 11, 18, 24, 31, 40, 44, 53, 10, 19, 23, 32, 39, 45, 52, 54, 20, 22, 33, 38, 46, 51, 55, 60, 21, 34, 37, 47, 50, 56, 59, 61, 35, 36, 48, 49, 57, 58, 62, 63 }; static const uint8_t raster2ZZ_2x8[] = { 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 }; static const uint8_t raster2ZZ_8x2[] = { 0, 1, 2, 4, 3, 5, 6, 8, 7, 9, 10, 12, 11, 13, 14, 15 }; static const uint8_t tab_scan_coeff_pos_in_cg[4][4] = { { 0, 1, 5, 6 }, { 2, 4, 7, 12 }, { 3, 8, 11, 13 }, { 9, 10, 14, 15 } }; static const uint8_t tab_cwr[] = { 3, 3, 4, 5, 5, 5, 5 /* 5, 5, 5, 5 */ }; static const uint16_t tab_lg_pmps_offset[] = { 0, 0, 0, 197, 95, 46 /* 5, 5, 5, 5 */ }; static const int tab_pdir_bskip[DS_MAX_NUM] = { PDIR_SYM, PDIR_BID, PDIR_BWD, PDIR_SYM, PDIR_FWD }; /** * =========================================================================== * defines * =========================================================================== */ enum aec_const_e { LG_PMPS_SHIFTNO = 2, B_BITS = 10, QUARTER_SHIFT = (B_BITS-2), HALF = (1 << (B_BITS-1)), QUARTER = (1 << (B_BITS-2)), AEC_VALUE_BOUND = 254, /* make sure rs1 will not overflow for 8-bit uint8_t */ }; static const int8_t tab_intra_mode_luma2chroma[NUM_INTRA_MODE] = { DC_PRED_C, -1, BI_PRED_C, -1, -1, -1, -1, -1, -1, -1, -1, -1, VERT_PRED_C, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, HOR_PRED_C, -1, -1, -1, -1, -1, -1, -1, -1 }; /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE int aec_get_next_bit(aec_t *p_aec) { uint32_t next_bit; if (--p_aec->i_bits_to_go < 0) { int diff = p_aec->i_bytes - p_aec->i_byte_pos; uint8_t *p_buffer = p_aec->p_buffer + p_aec->i_byte_pos; #if 1 if (diff > 7) { p_aec->i_byte_buf = ((uint64_t)p_buffer[0] << 56) | ((uint64_t)p_buffer[1] << 48) | ((uint64_t)p_buffer[2] << 40) | ((uint64_t)p_buffer[3] << 32) | ((uint64_t)p_buffer[4] << 24) | ((uint64_t)p_buffer[5] << 16) | ((uint64_t)p_buffer[6] << 8) | (uint64_t)p_buffer[7]; p_aec->i_bits_to_go = 63; 
p_aec->i_byte_pos += 8; } else if (diff > 0) { /* fewer than 8 bytes of this frame's bitstream remain; this happens at most once per picture */ int i; p_aec->i_bits_to_go += (int8_t)(diff << 3); p_aec->i_byte_pos += (p_aec->i_bits_to_go + 1) >> 3; p_aec->i_byte_buf = 0; for (i = 0; i < diff; i++) { p_aec->i_byte_buf = (p_aec->i_byte_buf << 8) | p_buffer[i]; } } else { p_aec->b_bit_error = 1; return 1; } #else int i; if (diff > 8) { diff = 8; } else if (diff <= 0) { p_aec->b_bit_error = 1; return 1; } p_aec->i_bits_to_go += (diff << 3); p_aec->i_byte_pos += (p_aec->i_bits_to_go + 1) >> 3; p_aec->i_byte_buf = 0; for (i = 0; i < diff; i++) { p_aec->i_byte_buf = (p_aec->i_byte_buf << 8) | p_buffer[i]; } #endif } /* get next bit */ next_bit = ((p_aec->i_byte_buf >> p_aec->i_bits_to_go) & 0x01); p_aec->i_value_t = (p_aec->i_value_t << 1) | next_bit; return 0; } /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE int aec_get_next_n_bit(aec_t *p_aec, int num_bits) { if (p_aec->i_bits_to_go >= num_bits) { uint32_t next_bits; p_aec->i_bits_to_go -= (int8_t)num_bits; next_bits = (p_aec->i_byte_buf >> p_aec->i_bits_to_go) & ((1 << num_bits) - 1); p_aec->i_value_t = (p_aec->i_value_t << num_bits) | next_bits; return 0; } else { for (; num_bits != 0; num_bits--) { aec_get_next_bit(p_aec); } return p_aec->b_bit_error; } } /** * =========================================================================== * function defines * =========================================================================== */ /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE void update_ctx_mps(context_t *ctx) { #if CTRL_OPT_AEC ctx->v = g_tab_ctx_mps[ctx->v].v; #else uint32_t lg_pmps = ctx->LG_PMPS; uint8_t cycno = (uint8_t)ctx->cycno; uint32_t cwr = tab_cwr[cycno]; // update probability estimation and other parameters if (cycno == 0) { ctx->cycno = 1; } lg_pmps -= (lg_pmps >> cwr) + (lg_pmps >> (cwr + 2)); ctx->LG_PMPS = (uint16_t)lg_pmps; #endif } /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE void update_ctx_lps(context_t *ctx) { #if CTRL_OPT_AEC ctx->v = g_tab_ctx_lps[ctx->v].v; #else uint32_t cycno = ctx->cycno; uint32_t cwr = tab_cwr[cycno]; uint32_t lg_pmps = ctx->LG_PMPS + tab_lg_pmps_offset[cwr]; uint32_t mps = ctx->MPS; // update probability estimation and other parameters if (cycno != 3) { ++cycno; } if (lg_pmps >= (256 << LG_PMPS_SHIFTNO)) { lg_pmps = (512 << LG_PMPS_SHIFTNO) - 1 - lg_pmps; mps = !mps; } ctx->v = MAKE_CONTEXT(lg_pmps, mps, cycno); #endif } #if CTRL_OPT_AEC /* --------------------------------------------------------------------------- */ void init_aec_context_tab(void) { static bool_t b_inited = 0; context_t ctx_i; context_t ctx_o; int cycno; int mps; if (b_inited != 0) { return; } /* init context table */ b_inited = 1; ctx_i.v = 0; ctx_o.v = 0; memset(g_tab_ctx_mps, 0, sizeof(g_tab_ctx_mps)); memset(g_tab_ctx_lps, 0, sizeof(g_tab_ctx_lps)); /* mps */ for (cycno = 0; cycno < 4; cycno++) { uint32_t cwr = tab_cwr[cycno]; ctx_i.cycno = cycno; ctx_o.cycno = (uint8_t)DAVS2_MAX(cycno, 1); for (mps = 0; mps < 2; mps++) { ctx_i.MPS = (uint8_t)mps; ctx_o.MPS = (uint8_t)mps; for (ctx_i.LG_PMPS = 0; ctx_i.LG_PMPS <= 1024; ctx_i.LG_PMPS++) { uint32_t lg_pmps = ctx_i.LG_PMPS; lg_pmps -= (lg_pmps >> cwr) + (lg_pmps >> (cwr + 2)); ctx_o.LG_PMPS = (uint16_t)lg_pmps; g_tab_ctx_mps[ctx_i.v].v = ctx_o.v; } } } /* lps */ for (cycno = 0; cycno < 4; cycno++) { uint32_t cwr = tab_cwr[cycno]; ctx_i.cycno = cycno; ctx_o.cycno =
(uint8_t)DAVS2_MIN(cycno + 1, 3); for (mps = 0; mps < 2; mps++) { ctx_i.MPS = (uint8_t)mps; ctx_o.MPS = (uint8_t)mps; for (ctx_i.LG_PMPS = 0; ctx_i.LG_PMPS <= 1024; ctx_i.LG_PMPS++) { uint32_t lg_pmps = ctx_i.LG_PMPS + tab_lg_pmps_offset[cwr]; if (lg_pmps >= (256 << LG_PMPS_SHIFTNO)) { lg_pmps = (512 << LG_PMPS_SHIFTNO) - 1 - lg_pmps; ctx_o.MPS = !mps; } ctx_o.LG_PMPS = (uint16_t)lg_pmps; g_tab_ctx_lps[ctx_i.v].v = ctx_o.v; } } } } #endif /* --------------------------------------------------------------------------- * initializes the aec_t for the arithmetic decoder */ int aec_start_decoding(aec_t *p_aec, uint8_t *p_start, int i_byte_pos, int i_bytes) { #if CTRL_OPT_AEC init_aec_context_tab(); #endif p_aec->p_buffer = p_start; p_aec->i_byte_pos = i_byte_pos; p_aec->i_bytes = i_bytes; p_aec->i_bits_to_go = 0; p_aec->b_bit_error = 0; p_aec->b_val_domain = 1; p_aec->i_s1 = 0; p_aec->i_t1 = QUARTER - 1; // 0xff p_aec->i_value_s = 0; p_aec->i_value_t = 0; if (p_aec->i_bits_to_go < B_BITS - 1) { if (aec_get_next_n_bit(p_aec, B_BITS - 1)) { return 0; } } return 0; } /* --------------------------------------------------------------------------- */ int aec_bits_read(aec_t *p_aec) { return (p_aec->i_byte_pos << 3) - p_aec->i_bits_to_go; } /* --------------------------------------------------------------------------- */ static INLINE int biari_decode_symbol(aec_t *p_aec, context_t *ctx) { uint32_t lg_pmps = ctx->LG_PMPS >> LG_PMPS_SHIFTNO; uint32_t t2; uint32_t s2; uint32_t s_flag; uint32_t i_value_s = p_aec->i_value_s; int bit = ctx->MPS; int is_LPS; // p_aec->i_value_t is in R domain p_aec->i_s1=0 or p_aec->i_s1 == AEC_VALUE_BOUND if (p_aec->b_val_domain != 0 || (p_aec->i_s1 == AEC_VALUE_BOUND && p_aec->b_val_bound != 0)) { i_value_s = 0; p_aec->i_s1 = 0; while (p_aec->i_value_t < QUARTER && i_value_s < AEC_VALUE_BOUND) { if (aec_get_next_bit(p_aec)) { return 0; } i_value_s++; } p_aec->b_val_bound = p_aec->i_value_t < QUARTER; p_aec->i_value_t = p_aec->i_value_t & 0xff; } if (p_aec->i_value_s > AEC_VALUE_BOUND) { /// davs2_log(NULL, DAVS2_LOG_ERROR, "p_aec->i_value_s (>254)."); p_aec->b_bit_error = 1; p_aec->i_value_s = i_value_s; return 0; } s_flag = p_aec->i_t1 < lg_pmps; s2 = p_aec->i_s1 + s_flag; t2 = p_aec->i_t1 - lg_pmps + (s_flag << 8); // 8bits is_LPS = (s2 > i_value_s || (s2 == i_value_s && p_aec->i_value_t >= t2)) && p_aec->b_val_bound == 0; p_aec->b_val_domain = (bool_t)is_LPS; if (is_LPS) { // LPS uint32_t t_rlps = (s_flag == 0) ? 
(lg_pmps) : (p_aec->i_t1 + lg_pmps); int n_bits = 0; bit = !bit; if (s2 == i_value_s) { p_aec->i_value_t -= t2; } else { if (aec_get_next_bit(p_aec)) { return 0; } p_aec->i_value_t += 256 - t2; } // restore range while (t_rlps < QUARTER) { t_rlps <<= 1; n_bits++; } if (n_bits) { if (aec_get_next_n_bit(p_aec, n_bits)) { return 0; } } p_aec->i_s1 = 0; p_aec->i_t1 = t_rlps & 0xff; update_ctx_lps(ctx); } else { // MPS p_aec->i_s1 = s2; p_aec->i_t1 = t2; update_ctx_mps(ctx); } p_aec->i_value_s = i_value_s; return bit; } /* --------------------------------------------------------------------------- * return the decoded symbol */ static INLINE int biari_decode_symbol_eq_prob(aec_t *p_aec) { if (p_aec->b_val_domain != 0 || (p_aec->i_s1 == AEC_VALUE_BOUND && p_aec->b_val_bound != 0)) { p_aec->i_s1 = 0; if (aec_get_next_bit(p_aec)) { return 0; } if (p_aec->i_value_t >= (256 + p_aec->i_t1)) { // LPS p_aec->i_value_t -= (256 + p_aec->i_t1); return 1; } else { return 0; } } else { uint32_t s2 = p_aec->i_s1 + 1; uint32_t t2 = p_aec->i_t1; int is_LPS = s2 > p_aec->i_value_s || (s2 == p_aec->i_value_s && p_aec->i_value_t >= t2) && p_aec->b_val_bound == 0; p_aec->b_val_domain = (bool_t)is_LPS; if (is_LPS) { //LPS if (s2 == p_aec->i_value_s) { p_aec->i_value_t -= t2; } else { if (aec_get_next_bit(p_aec)) { return 0; } p_aec->i_value_t += 256 - t2; } return 1; } else { p_aec->i_s1 = s2; p_aec->i_t1 = t2; return 0; } } } /* --------------------------------------------------------------------------- */ static INLINE int biari_decode_final(aec_t *p_aec) { // static context_t ctx = { (1 << LG_PMPS_SHIFTNO), 0, 0 }; const uint32_t lg_pmps = 1; // ctx.LG_PMPS >> LG_PMPS_SHIFTNO; uint32_t t2; uint32_t s2; uint32_t s_flag; int is_LPS; // p_aec->i_value_t is in R domain p_aec->i_s1=0 or p_aec->i_s1 == AEC_VALUE_BOUND if (p_aec->b_val_domain != 0 || (p_aec->i_s1 == AEC_VALUE_BOUND && p_aec->b_val_bound != 0)) { p_aec->i_s1 = 0; p_aec->i_value_s = 0; while (p_aec->i_value_t < QUARTER && p_aec->i_value_s < AEC_VALUE_BOUND) { if (aec_get_next_bit(p_aec)) { return 0; } p_aec->i_value_s++; } p_aec->b_val_bound = p_aec->i_value_t < QUARTER; p_aec->i_value_t = p_aec->i_value_t & 0xff; } s_flag = p_aec->i_t1 < lg_pmps; s2 = p_aec->i_s1 + s_flag; t2 = p_aec->i_t1 - lg_pmps + (s_flag << 8); // 8bits /* decide whether the decoded bin is an LPS */ is_LPS = (s2 > p_aec->i_value_s || (s2 == p_aec->i_value_s && p_aec->i_value_t >= t2)) && p_aec->b_val_bound == 0; p_aec->b_val_domain = (bool_t)is_LPS; if (is_LPS) { // LPS uint32_t t_rlps = 1; int n_bits = 0; if (s2 == p_aec->i_value_s) { p_aec->i_value_t -= t2; } else { if (aec_get_next_bit(p_aec)) { return 0; } p_aec->i_value_t += 256 - t2; } // restore range while (t_rlps < QUARTER) { t_rlps <<= 1; n_bits++; } if (n_bits) { if (aec_get_next_n_bit(p_aec, n_bits)) { return 0; } } p_aec->i_s1 = 0; p_aec->i_t1 = 0; // return 1; // !ctx.MPS } else { // MPS p_aec->i_s1 = s2; p_aec->i_t1 = t2; // return 0; // ctx.MPS } return is_LPS; } /* --------------------------------------------------------------------------- * decode symbols until a zero bit is obtained or passed max_num symbols * (decode repeatedly with the same context until a 0 is decoded or max_num symbols have been read) */ static INLINE int biari_decode_symbol_continue0(aec_t *p_aec, context_t *ctx, int max_num) { uint32_t i_value_s = p_aec->i_value_s; int bit = 0; int i; for (i = 0; i < max_num && !bit; i++) { uint32_t lg_pmps = ctx->LG_PMPS >> LG_PMPS_SHIFTNO; uint32_t t2; uint32_t s2; uint32_t s_flag; int is_LPS; bit = ctx->MPS; if (p_aec->b_val_domain != 0 || (p_aec->i_s1 == AEC_VALUE_BOUND && p_aec->b_val_bound != 0)) { p_aec->i_s1 = 0;
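/* (re-derive i_value_s: shift bits in until i_value_t reaches QUARTER,
 * counting the shifts, capped at AEC_VALUE_BOUND -- the same re-entry
 * into the value domain performed in biari_decode_symbol() above) */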
i_value_s = 0; while (p_aec->i_value_t < QUARTER && i_value_s < AEC_VALUE_BOUND) { if (aec_get_next_bit(p_aec)) { return 0; } i_value_s++; } p_aec->b_val_bound = p_aec->i_value_t < QUARTER; p_aec->i_value_t = p_aec->i_value_t & 0xff; } s_flag = p_aec->i_t1 < lg_pmps; s2 = p_aec->i_s1 + s_flag; t2 = p_aec->i_t1 - lg_pmps + (s_flag << 8); // 8bits if (i_value_s > AEC_VALUE_BOUND) { /// davs2_log(NULL, DAVS2_LOG_ERROR, "i_value_s (>254)."); p_aec->b_bit_error = 1; return 0; } is_LPS = (s2 > i_value_s || (s2 == i_value_s && p_aec->i_value_t >= t2)) && p_aec->b_val_bound == 0; p_aec->b_val_domain = (bool_t)is_LPS; if (is_LPS) { // LPS uint32_t t_rlps = (s_flag == 0) ? (lg_pmps) : (p_aec->i_t1 + lg_pmps); int n_bits = 0; bit = !bit; if (s2 == i_value_s) { p_aec->i_value_t -= t2; } else { if (aec_get_next_bit(p_aec)) { return 0; } p_aec->i_value_t += 256 - t2; } // restore range while (t_rlps < QUARTER) { t_rlps <<= 1; n_bits++; } if (n_bits) { if (aec_get_next_n_bit(p_aec, n_bits)) { return 0; } } p_aec->i_s1 = 0; p_aec->i_t1 = t_rlps & 0xff; update_ctx_lps(ctx); } else { // MPS p_aec->i_s1 = s2; p_aec->i_t1 = t2; update_ctx_mps(ctx); } } p_aec->i_value_s = i_value_s; return i - bit; } /* --------------------------------------------------------------------------- */ static int biari_decode_symbol_continu0_ext(aec_t *p_aec, context_t *ctx, int max_ctx_inc, int max_num) { int bit = 0; int i; for (i = 0; i < max_num && !bit; i++) { int ctx_add = DAVS2_MIN(i, max_ctx_inc); context_t *p_ctx = ctx + ctx_add; uint32_t lg_pmps = p_ctx->LG_PMPS >> LG_PMPS_SHIFTNO; uint32_t t2; uint32_t s2; int is_LPS; int s_flag; bit = p_ctx->MPS; if (p_aec->b_val_domain != 0 || (p_aec->i_s1 == AEC_VALUE_BOUND && p_aec->b_val_bound != 0)) { p_aec->i_s1 = 0; p_aec->i_value_s = 0; while (p_aec->i_value_t < QUARTER && p_aec->i_value_s < AEC_VALUE_BOUND) { if (aec_get_next_bit(p_aec)) { return 0; } p_aec->i_value_s++; } p_aec->b_val_bound = p_aec->i_value_t < QUARTER; p_aec->i_value_t = p_aec->i_value_t & 0xff; } s_flag = p_aec->i_t1 < lg_pmps; s2 = p_aec->i_s1 + s_flag; t2 = p_aec->i_t1 - lg_pmps + (s_flag << 8); // 8bits if (p_aec->i_value_s > AEC_VALUE_BOUND) { /// davs2_log(NULL, DAVS2_LOG_ERROR, "p_aec->i_value_s (>254)."); /// exit(1); p_aec->b_bit_error = 1; return 0; } is_LPS = (s2 > p_aec->i_value_s || (s2 == p_aec->i_value_s && p_aec->i_value_t >= t2)) && p_aec->b_val_bound == 0; p_aec->b_val_domain = (bool_t)is_LPS; if (is_LPS) { // LPS uint32_t t_rlps = (s_flag == 0) ? 
(lg_pmps) : (p_aec->i_t1 + lg_pmps);
            bit = !bit;

            if (s2 == p_aec->i_value_s) {
                p_aec->i_value_t -= t2;
            } else {
                if (aec_get_next_bit(p_aec)) {
                    return 0;
                }
                p_aec->i_value_t += 256 - t2;
            }

            // restore range
            while (t_rlps < QUARTER) {
                t_rlps <<= 1;
                if (aec_get_next_bit(p_aec)) {
                    return 0;
                }
            }

            p_aec->i_s1 = 0;
            p_aec->i_t1 = t_rlps & 0xff;

            update_ctx_lps(p_ctx);
        } else {        // MPS
            p_aec->i_s1 = s2;
            p_aec->i_t1 = t2;

            update_ctx_mps(p_ctx);
        }
    }

    return i - bit;
}

/* ---------------------------------------------------------------------------
 * decoding of unary binarization using one or 2 distinct models for the first
 * and all remaining bins; no terminating "0" for max_symbol
 */
static int unary_bin_max_decode(aec_t *p_aec, context_t *ctx, int ctx_offset, int max_symbol)
{
    int symbol = biari_decode_symbol(p_aec, ctx);

    if (symbol == 1) {
        return 0;
    } else {
        if (max_symbol == 1) {
            return symbol;
        } else {
            context_t *p_ctx = ctx + ctx_offset;
            symbol = 1 + biari_decode_symbol_continue0(p_aec, p_ctx, max_symbol - 1);
            return symbol;
        }
    }
}

/* --------------------------------------------------------------------------- */
void aec_init_contexts(aec_t *p_aec)
{
    const uint16_t lg_pmps = ((QUARTER << LG_PMPS_SHIFTNO) - 1);
    uint16_t v = MAKE_CONTEXT(lg_pmps, 0, 0);
    uint16_t *d = (uint16_t *)&p_aec->syn_ctx;
    int ctx_cnt = sizeof(context_set_t) / sizeof(uint16_t);

    while (ctx_cnt-- != 0) {
        *d++ = v;
    }
}

/* --------------------------------------------------------------------------- */
void aec_new_slice(davs2_t *h)
{
    h->i_last_dquant = 0;
}

/* --------------------------------------------------------------------------- */
int aec_read_dmh_mode(aec_t *p_aec, int i_cu_level)
{
    context_t *p_ctx = p_aec->syn_ctx.pu_type_index + (i_cu_level - 3) * 3 + NUM_INTER_DIR_DHP_CTX;

    assert(NUM_INTER_DIR_DHP_CTX + NUM_DMH_MODE_CTX == NUM_INTER_DIR_CTX);

    if (biari_decode_symbol(p_aec, p_ctx) == 0) {
        return 0;
    } else {
        if (biari_decode_symbol(p_aec, p_ctx + 1) == 0) {
            return 3 + biari_decode_symbol_eq_prob(p_aec);   // 3, 4: bin string "10x"
        } else {
            if (biari_decode_symbol(p_aec, p_ctx + 2) == 0) {
                return 7 + biari_decode_symbol_eq_prob(p_aec);   // 7, 8: bin string "110x"
            } else {
                /* 1, 2: bin string "1110x"
                 * 5, 6: bin string "1111x" */
                int b3 = biari_decode_symbol_eq_prob(p_aec);
                int b4 = biari_decode_symbol_eq_prob(p_aec);
                return 1 + (b3 << 2) + b4;
            }
        }
    }
}

/* ---------------------------------------------------------------------------
 * arithmetically decode the motion vector difference
 */
static INLINE int aec_read_mvd(aec_t *p_aec, context_t *p_ctx)
{
    int binary_symbol = 0;
    int golomb_order = 0;
    int act_sym;

    if (!biari_decode_symbol(p_aec, p_ctx + 0)) {
        act_sym = 0;
    } else if (!biari_decode_symbol(p_aec, p_ctx + 1)) {
        act_sym = 1;
    } else if (!biari_decode_symbol(p_aec, p_ctx + 2)) {
        act_sym = 2;
    } else {
        // prefix "111": Exp-Golomb coded escape
        int add_sym = biari_decode_symbol_eq_prob(p_aec);
        act_sym = 0;

        for (;;) {
            int l = biari_decode_symbol_eq_prob(p_aec);
            AEC_RETURN_ON_ERROR(0);
            if (l == 0) {
                act_sym += (1 << golomb_order);
                golomb_order++;
            } else {
                break;
            }
        }

        while (golomb_order--) {
            // next binary part
            if (biari_decode_symbol_eq_prob(p_aec)) {
                binary_symbol |= (1 << golomb_order);
            }
        }

        act_sym += binary_symbol;
        act_sym = (act_sym << 1) + 3 + add_sym;
    }

    if (act_sym != 0) {
        if (biari_decode_symbol_eq_prob(p_aec)) {
            act_sym = -act_sym;
        }
    }

    return act_sym;
}

/* ---------------------------------------------------------------------------
 * arithmetically decode the motion vector difference
 */
void aec_read_mvds(aec_t *p_aec, mv_t *p_mvd)
{
    p_mvd->x = (int16_t)aec_read_mvd(p_aec, p_aec->syn_ctx.mvd_contexts[0]);
p_mvd->y = (int16_t)aec_read_mvd(p_aec, p_aec->syn_ctx.mvd_contexts[1]); } /* --------------------------------------------------------------------------- * arithmetically decode the 8x8 block type */ static INLINE int aec_read_wpm(aec_t *p_aec, int num_of_references) { context_t *p_ctx = p_aec->syn_ctx.weighted_skip_mode; return biari_decode_symbol_continu0_ext(p_aec, p_ctx, 2, num_of_references - 1); } /* --------------------------------------------------------------------------- */ static INLINE int aec_read_dir_skip_mode(aec_t *p_aec) { context_t *p_ctx = p_aec->syn_ctx.cu_subtype_index; int act_sym = biari_decode_symbol_continu0_ext(p_aec, p_ctx, 32768, 3); if (act_sym == 3) { act_sym += (!biari_decode_symbol(p_aec, p_ctx + 3)); } return act_sym; } /* --------------------------------------------------------------------------- * TU split type when TU split is enabled for current CU */ static ALWAYS_INLINE int cu_set_tu_split_type(davs2_t *h, cu_t *p_cu, int transform_split_flag) { // split types // [mode][(NSQT enable or SDIP enables) and cu_level > B8X8_IN_BIT] // split_type for block non-SDIP/NSQT:[0] and SDIP/NSQT:[1] static const int8_t TU_SPLIT_TYPE[MAX_PRED_MODES][2] = { { TU_SPLIT_CROSS, TU_SPLIT_CROSS }, // 0: 8x8, ---, ---, --- (PRED_SKIP ) { TU_SPLIT_CROSS, TU_SPLIT_CROSS }, // 1: 8x8, ---, ---, --- (PRED_2Nx2N ) { TU_SPLIT_CROSS, TU_SPLIT_HOR }, // 2: 8x4, 8x4, ---, --- (PRED_2NxN ) { TU_SPLIT_CROSS, TU_SPLIT_VER }, // 3: 4x8, 4x8, ---, --- (PRED_Nx2N ) { TU_SPLIT_CROSS, TU_SPLIT_HOR }, // 4: 8x2, 8x6, ---, --- (PRED_2NxnU ) { TU_SPLIT_CROSS, TU_SPLIT_HOR }, // 5: 8x6, 8x2, ---, --- (PRED_2NxnD ) { TU_SPLIT_CROSS, TU_SPLIT_VER }, // 6: 2x8, 6x8, ---, --- (PRED_nLx2N ) { TU_SPLIT_CROSS, TU_SPLIT_VER }, // 7: 6x8, 2x8, ---, --- (PRED_nRx2N ) { TU_SPLIT_NON, TU_SPLIT_INVALID }, // 8: 8x8, ---, ---, --- (PRED_I_2Nx2N) { TU_SPLIT_CROSS, TU_SPLIT_CROSS }, // 9: 4x4, 4x4, 4x4, 4x4 (PRED_I_NxN ) { TU_SPLIT_INVALID, TU_SPLIT_HOR }, //10: 8x2, 8x2, 8x2, 8x2 (PRED_I_2Nxn ) { TU_SPLIT_INVALID, TU_SPLIT_VER } //11: 2x8, 2x8, 2x8, 2x8 (PRED_I_nx2N ) }; int mode = p_cu->i_cu_type; int level = p_cu->i_cu_level; int enable_nsqt_sdip = IS_INTRA_MODE(mode) ? h->seq_info.enable_sdip : h->seq_info.enable_nsqt; enable_nsqt_sdip = enable_nsqt_sdip && level > B8X8_IN_BIT; p_cu->i_trans_size = transform_split_flag ? TU_SPLIT_TYPE[mode][enable_nsqt_sdip] : TU_SPLIT_NON; assert(p_cu->i_trans_size != TU_SPLIT_INVALID); return p_cu->i_trans_size; } /* --------------------------------------------------------------------------- */ int aec_read_intra_cu_type(aec_t *p_aec, cu_t *p_cu, int b_sdip, davs2_t *h) { int cu_type = PRED_I_NxN; int b_tu_split = 0; b_sdip = (p_cu->i_cu_level == B32X32_IN_BIT || p_cu->i_cu_level == B16X16_IN_BIT) && b_sdip; /* 1, read intra cu split flag */ if (p_cu->i_cu_level == B8X8_IN_BIT || b_sdip) { context_t * p_ctx = p_aec->syn_ctx.transform_split_flag; b_tu_split = biari_decode_symbol(p_aec, p_ctx + 1 + b_sdip); } #if AVS2_TRACE avs2_trace("Transform_Size = %3d \n", b_tu_split); #endif /* 2, read intra CU partition type */ if (!b_tu_split) { cu_type = PRED_I_2Nx2N; } else if (b_sdip) { context_t * p_ctx = p_aec->syn_ctx.intra_pu_type_contexts; int symbol1 = biari_decode_symbol(p_aec, p_ctx); cu_type = symbol1 ? 
PRED_I_2Nxn : PRED_I_nx2N; } #if AVS2_TRACE avs2_trace_string("cuType", cu_type, 1); #endif p_cu->i_cu_type = (int8_t)cu_type; cu_set_tu_split_type(h, p_cu, b_tu_split); return cu_type; } /* --------------------------------------------------------------------------- * arithmetically decode the coding unit type info of a given CU */ int aec_read_cu_type(aec_t *p_aec, cu_t *p_cu, int img_type, int b_amp, int b_mhp, int b_wsm, int num_references) { // 0: SKIP, 1: 2Nx2N, 2: 2NxN / 2NxnU / 2NxnD, 3: Nx2N / nLx2N / nRx2N, 9: INTRA static const int MAP_CU_TYPE[2][7] = { {-1, 0, 1, 2, 3, -1/*PRED_NxN*/, PRED_I_NxN}, {-1, 0, 1, 2, 3, PRED_I_NxN} }; int real_cu_type; if (img_type != AVS2_I_SLICE) { context_t *p_ctx = p_aec->syn_ctx.cu_type_contexts; int bin_idx = 0; int act_ctx = 0; int act_sym = 0; int max_bit = 6 - (p_cu->i_cu_level == B8X8_IN_BIT); int symbol; while (act_sym < max_bit) { if ((bin_idx == 5) && (p_cu->i_cu_level != MIN_CU_SIZE_IN_BIT)) { symbol = biari_decode_final(p_aec); } else { symbol = biari_decode_symbol(p_aec, p_ctx + act_ctx); } AEC_RETURN_ON_ERROR(-1); bin_idx++; if (symbol == 0) { act_sym++; act_ctx = DAVS2_MIN(5, act_ctx + 1); } else { break; } } real_cu_type = MAP_CU_TYPE[p_cu->i_cu_level == B8X8_IN_BIT][act_sym]; // for AMP if (p_cu->i_cu_level >= B16X16_IN_BIT && b_amp && (real_cu_type == 2 || real_cu_type == 3)) { context_t *p_ctx_amp = p_aec->syn_ctx.shape_of_partition_index; if (!biari_decode_symbol(p_aec, p_ctx_amp + 0)) { real_cu_type = real_cu_type * 2 + (!biari_decode_symbol(p_aec, p_ctx_amp + 1)); } } } else { real_cu_type = PRED_I_NxN; /* intra mode */ } #if AVS2_TRACE { int trace_cu_type = real_cu_type; if (trace_cu_type == PRED_I_2Nxn || trace_cu_type == PRED_I_nx2N) { trace_cu_type += 2; /* in order to trace same text as RM */ } trace_cu_type += (img_type == AVS2_B_SLICE); /* also here */ avs2_trace_string("cuType", trace_cu_type, 1); } #endif if (real_cu_type <= 0) { /* Skip Mode */ int weighted_skipmode_fix = 0; int md_directskip_mode = DS_NONE; if (img_type == AVS2_F_SLICE && b_wsm && num_references > 1) { weighted_skipmode_fix = aec_read_wpm(p_aec, num_references); #if AVS2_TRACE avs2_trace("weighted_skipmode1 = %3d \n", weighted_skipmode_fix); #endif } p_cu->i_weighted_skipmode = (int8_t)weighted_skipmode_fix; if ((weighted_skipmode_fix == 0) && ((b_mhp && img_type == AVS2_F_SLICE) || img_type == AVS2_B_SLICE)) { md_directskip_mode = aec_read_dir_skip_mode(p_aec); #if AVS2_TRACE avs2_trace("p_directskip_mode = %3d \n", md_directskip_mode); #endif } else { md_directskip_mode = DS_NONE; } p_cu->i_md_directskip_mode = (int8_t)md_directskip_mode; } return real_cu_type; } /* --------------------------------------------------------------------------- */ int aec_read_cu_type_sframe(aec_t *p_aec) { static const int MapSCUType[7] = {-1, PRED_SKIP, PRED_I_NxN}; context_t * p_ctx = p_aec->syn_ctx.cu_type_contexts; int act_ctx = 0; int cu_type = 0; for (;;) { if (biari_decode_symbol(p_aec, p_ctx + act_ctx) == 0) { cu_type++; act_ctx++; } else { break; } if (cu_type >= 2) { break; } } cu_type = MapSCUType[cu_type]; /* cu type */ #if AVS2_TRACE avs2_trace_string("cuType", cu_type, 1); #endif return cu_type; /* return cu type */ } /* --------------------------------------------------------------------------- */ static INLINE int aec_read_b_pdir(aec_t * p_aec, cu_t * p_cu) { static const int dir2offset[4][4] = { { 0, 2, 4, 9 }, { 3, 1, 5, 10 }, { 6, 7, 8, 11 }, { 12, 13, 14, 15 } }; int new_pdir[4] = { 3, 1, 0, 2 }; context_t *p_ctx = p_aec->syn_ctx.pu_type_index; 
    int act_ctx = 0;
    int act_sym = 0;
    int pdir = PDIR_FWD;
    int pdir0 = 0, pdir1 = 0;
    int symbol;

    if (p_cu->i_cu_type == PRED_2Nx2N) {
        /* act_ctx: 0, 1, 2 */
        act_sym = biari_decode_symbol_continu0_ext(p_aec, p_ctx, 32768, 2);
        if (act_sym == 2) {
            act_sym += (!biari_decode_symbol(p_aec, p_ctx + 2));
        }
        pdir = act_sym;
    } else if ((p_cu->i_cu_type >= PRED_2NxN && p_cu->i_cu_type <= PRED_nRx2N) && p_cu->i_cu_level == B8X8_IN_BIT) {
        p_ctx = p_aec->syn_ctx.b_pu_type_min_index;
        pdir0 = !biari_decode_symbol(p_aec, p_ctx + act_ctx);   // BW
        if (biari_decode_symbol(p_aec, p_ctx + act_ctx + 1)) {
            pdir1 = pdir0;
        } else {
            pdir1 = !pdir0;
        }
        pdir = dir2offset[pdir0][pdir1];
    } else if (p_cu->i_cu_type >= PRED_2NxN && p_cu->i_cu_type <= PRED_nRx2N) {
        /* act_ctx: 3, 4 */
        act_sym = biari_decode_symbol_continu0_ext(p_aec, p_ctx + 3, 32768, 2);
        /* act_ctx: 5 */
        if (act_sym == 2) {
            act_sym += (!biari_decode_symbol(p_aec, p_ctx + 5));
        }
        pdir0 = act_sym;

        if (biari_decode_symbol(p_aec, p_ctx + 6)) {
            pdir1 = pdir0;
        } else {
            switch (pdir0) {
            case 0:
                if (biari_decode_symbol(p_aec, p_ctx + 7)) {
                    pdir1 = 1;
                } else {
                    symbol = biari_decode_symbol(p_aec, p_ctx + 8);
                    pdir1 = symbol ? 2 : 3;
                }
                break;
            case 1:
                if (biari_decode_symbol(p_aec, p_ctx + 9)) {
                    pdir1 = 0;
                } else {
                    symbol = biari_decode_symbol(p_aec, p_ctx + 10);
                    pdir1 = symbol ? 2 : 3;
                }
                break;
            case 2:
                if (biari_decode_symbol(p_aec, p_ctx + 11)) {
                    pdir1 = 0;
                } else {
                    symbol = biari_decode_symbol(p_aec, p_ctx + 12);
                    pdir1 = symbol ? 1 : 3;
                }
                break;
            case 3:
                if (biari_decode_symbol(p_aec, p_ctx + 13)) {
                    pdir1 = 0;
                } else {
                    symbol = biari_decode_symbol(p_aec, p_ctx + 14);
                    pdir1 = symbol ? 1 : 2;
                }
                break;
            }
        }

        pdir0 = new_pdir[pdir0];
        pdir1 = new_pdir[pdir1];
        pdir = dir2offset[pdir0][pdir1];
    }

#if AVS2_TRACE
    if (p_cu->i_cu_type >= PRED_2NxN && p_cu->i_cu_type <= PRED_nRx2N) {
        avs2_trace_string("B_Pred_Dir0 ", pdir0, 1);
        avs2_trace_string("B_Pred_Dir1 ", pdir1, 1);
    } else if (p_cu->i_cu_type == PRED_2Nx2N) {
        avs2_trace_string("B_Pred_Dir ", pdir0, 1);
    }
#endif
    return pdir;
}

/* ---------------------------------------------------------------------------
 * arithmetically decode the PU type
 */
static INLINE int aec_read_pdir_dhp(aec_t * p_aec, cu_t * p_cu)
{
    static const int dir2offset[2][2] = { { 0, 1 }, { 2, 3 } };
    context_t *p_ctx = p_aec->syn_ctx.pu_type_index;
    int pdir = PDIR_FWD;
    int pdir0 = 0, pdir1 = 0;
    int symbol;

    if (p_cu->i_cu_type == PRED_2Nx2N) {
        pdir = pdir0 = biari_decode_symbol(p_aec, p_ctx);
    } else if (p_cu->i_cu_type >= PRED_2NxN && p_cu->i_cu_type <= PRED_nRx2N) {
        pdir0 = biari_decode_symbol(p_aec, p_ctx + 1);
        symbol = biari_decode_symbol(p_aec, p_ctx + 2);
        if (symbol) {
            pdir1 = pdir0;
        } else {
            pdir1 = 1 - pdir0;
        }
        pdir = dir2offset[pdir0][pdir1];
    }

#if AVS2_TRACE
    if (p_cu->i_cu_type >= PRED_2NxN && p_cu->i_cu_type <= PRED_nRx2N) {
        avs2_trace_string("P_Pred_Dir0 ", pdir0, 1);
        avs2_trace_string("P_Pred_Dir1 ", pdir1, 1);
    } else if (p_cu->i_cu_type == PRED_2Nx2N) {
        avs2_trace_string("P_Pred_Dir ", pdir0, 1);
    }
#endif
    return pdir;
}
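/* Compiled-out sketch (ours, for illustration): splitting the combined
 * B-frame pdir produced by aec_read_b_pdir() back into the two per-PU
 * direction codes, i.e. the inverse of its dir2offset[][] table, which is a
 * bijection onto 0..15. */
#if 0
static void example_split_b_pdir(int pdir, int *pdir0, int *pdir1)
{
    static const int8_t inv0[16] = { 0, 1, 0, 1, 0, 1, 2, 2, 2, 0, 1, 2, 3, 3, 3, 3 };
    static const int8_t inv1[16] = { 0, 1, 1, 0, 2, 2, 0, 1, 2, 3, 3, 3, 0, 1, 2, 3 };
    *pdir0 = inv0[pdir];   /* row index into dir2offset */
    *pdir1 = inv1[pdir];   /* column index into dir2offset */
}
#endif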
/* ---------------------------------------------------------------------------
 * set CU prediction direction for P/F-Frames
 */
static INLINE void cu_set_pdir_PFframe(cu_t *p_cu, int pdir)
{
    static const int8_t pdir0[4] = { PDIR_FWD, PDIR_FWD,  PDIR_DUAL, PDIR_DUAL };
    static const int8_t pdir1[4] = { PDIR_FWD, PDIR_DUAL, PDIR_FWD,  PDIR_DUAL };
    int i_cu_type = p_cu->i_cu_type;
    int i;

    if (i_cu_type == PRED_2Nx2N) {          // 16x16
        /* one whole-CU PU; all four entries are assigned so that the
         * later DMH mode decision can be made */
        pdir = (pdir == PDIR_FWD ? PDIR_FWD : PDIR_DUAL);
        for (i = 0; i < 4; i++) {
            p_cu->b8pdir[i] = (int8_t)pdir;
        }
    } else if (IS_HOR_PU_PART(i_cu_type)) { // horizontal: 16x8, 16x4, 16x12
        /* horizontally split CU: two PUs; entries [2/3] are also assigned
         * so that the later DMH mode decision can be made */
        p_cu->b8pdir[0] = p_cu->b8pdir[2] = pdir0[pdir];
        p_cu->b8pdir[1] = p_cu->b8pdir[3] = pdir1[pdir];
    } else if (IS_VER_PU_PART(i_cu_type)) { // vertical:
        /* vertically split CU: two PUs; entries [2/3] are also assigned
         * so that the later DMH mode decision can be made */
        p_cu->b8pdir[0] = p_cu->b8pdir[2] = pdir0[pdir];
        p_cu->b8pdir[1] = p_cu->b8pdir[3] = pdir1[pdir];
    } else {
        /* intra mode */
        for (i = 0; i < 4; i++) {
            p_cu->b8pdir[i] = PDIR_INVALID;
        }
    }
}

/* ---------------------------------------------------------------------------
 * set CU prediction direction for B-Frames
 */
static INLINE void cu_set_pdir_Bframe(cu_t *p_cu, int pdir)
{
    static const int8_t pdir0[16] = {
        PDIR_FWD, PDIR_BWD, PDIR_FWD, PDIR_BWD, PDIR_FWD, PDIR_BWD, PDIR_SYM, PDIR_SYM,
        PDIR_SYM, PDIR_FWD, PDIR_BWD, PDIR_SYM, PDIR_BID, PDIR_BID, PDIR_BID, PDIR_BID
    };
    static const int8_t pdir1[16] = {
        PDIR_FWD, PDIR_BWD, PDIR_BWD, PDIR_FWD, PDIR_SYM, PDIR_SYM, PDIR_FWD, PDIR_BWD,
        PDIR_SYM, PDIR_BID, PDIR_BID, PDIR_BID, PDIR_FWD, PDIR_BWD, PDIR_SYM, PDIR_BID
    };
    static const int8_t pdir2refidx[4][2] = {
        { B_FWD,       INVALID_REF },  // PDIR_FWD
        { INVALID_REF, B_BWD       },  // PDIR_BWD
        { B_FWD,       B_BWD       },
        { B_FWD,       B_BWD       }
    };
    int i_cu_type = p_cu->i_cu_type;
    int8_t *b8pdir = p_cu->b8pdir;
    int i;

    //--- set b8type, and b8pdir ---
    if (i_cu_type == PRED_SKIP) {           // direct
        /* skip/direct mode: one PU; the direction is replicated to all 4 entries */
        pdir = tab_pdir_bskip[p_cu->i_md_directskip_mode];
        for (i = 0; i < 4; i++) {
            b8pdir[i] = (int8_t)pdir;
        }
    } else if (i_cu_type == PRED_2Nx2N) {   // 16x16
        /* one whole-CU PU */
        for (i = 0; i < 4; i++) {
            b8pdir[i] = (int8_t)pdir;
        }
    } else if (IS_HOR_PU_PART(i_cu_type)) { // 16x8, 16x4, 16x12
        /* horizontally split CU: two PUs */
        b8pdir[0] = b8pdir[2] = pdir0[pdir];
        b8pdir[1] = b8pdir[3] = pdir1[pdir];
    } else if (IS_VER_PU_PART(i_cu_type)) {
        /* vertically split CU: two PUs */
        b8pdir[0] = b8pdir[2] = pdir0[pdir];
        b8pdir[1] = b8pdir[3] = pdir1[pdir];
    } else {                                // intra mode
        for (i = 0; i < 4; i++) {
            b8pdir[i] = PDIR_INVALID;
        }
    }

    for (i = 0; i < 4; i++) {
        const int8_t *p_idx = pdir2refidx[b8pdir[i]];
        p_cu->ref_idx[i].r[0] = p_idx[0];
        p_cu->ref_idx[i].r[1] = p_idx[1];
    }
}

/* ---------------------------------------------------------------------------
 * arithmetically decode the reference parameter of a given MB
 */
static INLINE int aec_read_ref_frame(aec_t *p_aec, int num_of_references)
{
    context_t *p_ctx = p_aec->syn_ctx.pu_reference_index;
    int act_sym;

    if (biari_decode_symbol(p_aec, p_ctx)) {
        act_sym = 0;
    } else {
        int act_ctx = 1;
        act_sym = 1;

        // TODO: this loop could be optimized
        while ((act_sym != num_of_references - 1) && (!biari_decode_symbol(p_aec, p_ctx + act_ctx))) {
            act_sym++;
            act_ctx = DAVS2_MIN(2, act_ctx + 1);
        }
    }

    return act_sym;
}
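/* Worked example (illustration only) of the truncated-unary binarization
 * inverted by aec_read_ref_frame() above, for num_of_references == 4:
 *   ref_idx 0 -> "1"
 *   ref_idx 1 -> "01"
 *   ref_idx 2 -> "001"
 *   ref_idx 3 -> "000"   (the last value needs no terminating one-bin)
 * The first bin uses context 0; later bins use contexts 1, 2, 2, ... */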
/* --------------------------------------------------------------------------- */
static INLINE int cu_read_references(davs2_t *h, aec_t *p_aec, cu_t *p_cu)
{
    int idx_pu;
    int num_pu = p_cu->i_cu_type == PRED_2Nx2N ? 1 : 2;

    /* if multiple reference frames are used, read the reference index of each PU */
    for (idx_pu = 0; idx_pu < num_pu; idx_pu++) {
        int8_t ref_1st, ref_2nd;

        // non skip (direct)
        assert(p_cu->b8pdir[idx_pu] == PDIR_FWD || p_cu->b8pdir[idx_pu] == PDIR_DUAL);
        if (h->num_of_references > 1) {
            ref_1st = (int8_t)aec_read_ref_frame(p_aec, h->num_of_references);
            AEC_RETURN_ON_ERROR(-1);
#if AVS2_TRACE
            avs2_trace("Fwd Ref frame no = %3d \n", ref_1st);
#endif
        } else {
            ref_1st = 0;
        }

        if (p_cu->b8pdir[idx_pu] == PDIR_DUAL) {
            ref_2nd = !ref_1st;
        } else {
            ref_2nd = INVALID_REF;
        }

        p_cu->ref_idx[idx_pu].r[0] = ref_1st;
        p_cu->ref_idx[idx_pu].r[1] = ref_2nd;
    }

    return 0;
}

/* --------------------------------------------------------------------------- */
void aec_read_inter_pred_dir(aec_t * p_aec, cu_t *p_cu, davs2_t *h)
{
    int pdir = PDIR_FWD;
    int real_cu_type = p_cu->i_cu_type;

    if (h->i_frame_type == AVS2_B_SLICE) {  // B frame
        if (real_cu_type >= PRED_2Nx2N && real_cu_type <= PRED_nRx2N) {
            pdir = aec_read_b_pdir(p_aec, p_cu);
        }
        cu_set_pdir_Bframe(p_cu, pdir);
    } else {                                // other inter frame
        if (IS_SKIP_MODE(real_cu_type)) {
            int i;
            if (p_cu->i_weighted_skipmode ||
                p_cu->i_md_directskip_mode == DS_DUAL_1ST ||
                p_cu->i_md_directskip_mode == DS_DUAL_2ND) {
                pdir = PDIR_DUAL;
            }
            for (i = 0; i < 4; i++) {
                p_cu->b8pdir[i] = (int8_t)pdir;
            }
        } else {
            if (h->i_frame_type == AVS2_F_SLICE && h->num_of_references > 1 && h->seq_info.enable_dhp) {
                if (!(p_cu->i_cu_level == B8X8_IN_BIT && real_cu_type >= PRED_2NxN && real_cu_type <= PRED_nRx2N)) {
                    pdir = aec_read_pdir_dhp(p_aec, p_cu);
                }
            }
            cu_set_pdir_PFframe(p_cu, pdir);
        }

        if (h->i_frame_type != AVS2_S_SLICE && p_cu->i_cu_type != PRED_SKIP) {
            cu_read_references(h, p_aec, p_cu);
        }
    }
}

/* ---------------------------------------------------------------------------
 * arithmetically decode a pair of intra prediction modes of a given MB
 */
int aec_read_intra_pmode(aec_t * p_aec)
{
    context_t * p_ctx = p_aec->syn_ctx.intra_luma_pred_mode;
    int symbol;

    if (biari_decode_symbol(p_aec, p_ctx) == 1) {
        // -2 / -1: select one of the two most probable modes
        symbol = biari_decode_symbol(p_aec, p_ctx + 6) - 2;
    } else {
        symbol  = biari_decode_symbol(p_aec, p_ctx + 1) << 4;
        symbol += biari_decode_symbol(p_aec, p_ctx + 2) << 3;
        symbol += biari_decode_symbol(p_aec, p_ctx + 3) << 2;
        symbol += biari_decode_symbol(p_aec, p_ctx + 4) << 1;
        symbol += biari_decode_symbol(p_aec, p_ctx + 5);
    }

#if AVS2_TRACE
    avs2_trace("@%d %s\t\t\t%d\n", symbolCount++, p_aec->tracestring, symbol);
#endif
    return symbol;
}

/* ---------------------------------------------------------------------------
 * arithmetically decode the delta qp of a given CU
 */
static INLINE int aec_read_cu_delta_qp(aec_t * p_aec, int i_last_dequant)
{
    context_t * p_ctx = p_aec->syn_ctx.delta_qp_contexts;
    int act_sym;
    int dquant;

    act_sym = 1 - biari_decode_symbol(p_aec, p_ctx + (!!i_last_dequant));

    if (act_sym != 0) {
        act_sym = unary_bin_max_decode(p_aec, p_ctx + 2, 1, 256) + 1;
    }

    /* cu_qp_delta range: -(32 + 4 * (BitDepth - 8)) .. (32 + 4 * (BitDepth - 8)) */
    dquant = (act_sym + 1) >> 1;
    if ((act_sym & 0x01) == 0) {    // LSB is the sign bit
        dquant = -dquant;
    }

#if AVS2_TRACE
    avs2_trace("@%d %s\t\t\t%d\n", symbolCount++, p_aec->tracestring, dquant);
#endif
    return dquant;
}
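/* Worked example (illustration only) of the sign-in-LSB mapping used by
 * aec_read_cu_delta_qp() above, dquant = (act_sym + 1) >> 1 negated when the
 * LSB of act_sym is 0:
 *   act_sym: 0   1   2   3   4   5  ...
 *   dquant : 0  +1  -1  +2  -2  +3  ...
 * e.g. act_sym == 4 gives (4 + 1) >> 1 == 2 with LSB 0, hence dquant == -2. */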
/* ---------------------------------------------------------------------------
 * arithmetically decode the ctp_y[i] of a given cu
 */
static int aec_read_ctp_y(davs2_t *h, aec_t *p_aec, int b8, cu_t *p_cu, int scu_x, int scu_y)
{
    context_t *p_ctx;
    int b_hor = p_cu->i_trans_size == TU_SPLIT_HOR;   // is current CU hor TU partition
    int b_ver = p_cu->i_trans_size == TU_SPLIT_VER;   // is current CU ver TU partition
    int i_level = p_cu->i_cu_level;
    int cu_size = 1 << i_level;
    int a = 0, b = 0;   // ctp_y[i] of neighboring blocks
    int x, y;

    /* position of the current TB inside the CU */
    if (b_hor) {
        x = 0;
        y = ((cu_size * b8) >> 2);
    } else if (b_ver) {
        x = ((cu_size * b8) >> 2);
        y = 0;
    } else {
        x = ((cu_size * (b8 & 1)) >> 1);
        y = ((cu_size * (b8 >> 1)) >> 1);
    }

    /* position of the TB in the picture */
    x += (scu_x << MIN_CU_SIZE_IN_BIT);
    y += (scu_y << MIN_CU_SIZE_IN_BIT);

    /* convert to 4x4-block units */
    x >>= MIN_PU_SIZE_IN_BIT;
    y >>= MIN_PU_SIZE_IN_BIT;

    /* ctp_y of the left neighboring block */
    if (b_ver && b8 > 0) {
        a = (p_cu->i_cbp >> (b8 - 1)) & 1;
    } else {
        a = get_neighbor_cbp_y(h, x - 1, y, scu_x, scu_y, p_cu);
    }

    /* ctp_y of the above neighboring block */
    if (b_hor && b8 > 0) {
        b = (p_cu->i_cbp >> (b8 - 1)) & 1;
    } else {
        b = get_neighbor_cbp_y(h, x, y - 1, scu_x, scu_y, p_cu);
    }

    p_ctx = p_aec->syn_ctx.cbp_contexts + a + 2 * b;
    return biari_decode_symbol(p_aec, p_ctx);
}

/* --------------------------------------------------------------------------- */
static INLINE int aec_read_cbp(aec_t *p_aec, davs2_t *h, cu_t *p_cu, int scu_x, int scu_y)
{
    int cbp = 0;
    int cbp_bit = 0;

    if (IS_INTER(p_cu)) {
        if (IS_NOSKIP_INTER_MODE(p_cu->i_cu_type)) {
            cbp_bit = biari_decode_symbol(p_aec, p_aec->syn_ctx.cbp_contexts + 8);   // "ctp_zero_flag"
        }

        if (cbp_bit == 0) {
            // transform size
            int b_tu_split = biari_decode_symbol(p_aec, p_aec->syn_ctx.transform_split_flag);
            cu_set_tu_split_type(h, p_cu, b_tu_split);

            // chroma
            if (h->i_chroma_format != CHROMA_400) {
                cbp_bit = biari_decode_symbol(p_aec, p_aec->syn_ctx.cbp_contexts + 4);
                if (cbp_bit) {
                    cbp_bit = biari_decode_symbol(p_aec, p_aec->syn_ctx.cbp_contexts + 5);
                    if (cbp_bit) {
                        cbp += 48;
                    } else {
                        cbp_bit = biari_decode_symbol(p_aec, p_aec->syn_ctx.cbp_contexts + 5);
                        cbp += (cbp_bit == 1) ? 32 : 16;
                    }
                }
            }

            // luma
            if (b_tu_split == 0) {
                if (cbp == 0) {
                    cbp = 1;   // chroma blocks are all zero, but ctp_zero_flag indicated nonzero coefficients, so luma must be coded
                } else {
                    cbp_bit = aec_read_ctp_y(h, p_aec, 0, p_cu, scu_x, scu_y);
                    cbp += cbp_bit;
                }
            } else {
                cbp_bit = aec_read_ctp_y(h, p_aec, 0, p_cu, scu_x, scu_y);
                cbp += cbp_bit;
                p_cu->i_cbp = (int8_t)cbp;
                cbp_bit = aec_read_ctp_y(h, p_aec, 1, p_cu, scu_x, scu_y);
                cbp += (cbp_bit << 1);
                p_cu->i_cbp = (int8_t)cbp;
                cbp_bit = aec_read_ctp_y(h, p_aec, 2, p_cu, scu_x, scu_y);
                cbp += (cbp_bit << 2);
                p_cu->i_cbp = (int8_t)cbp;
                cbp_bit = aec_read_ctp_y(h, p_aec, 3, p_cu, scu_x, scu_y);
                cbp += (cbp_bit << 3);
                p_cu->i_cbp = (int8_t)cbp;
            }
        } else {
            cu_set_tu_split_type(h, p_cu, 1);
            p_cu->i_cbp = 0;
            cbp = 0;
        }
    } else {
        // intra luma
        if (p_cu->i_cu_type == PRED_I_2Nx2N) {
            cbp = aec_read_ctp_y(h, p_aec, 0, p_cu, scu_x, scu_y);
        } else {
            cbp_bit = aec_read_ctp_y(h, p_aec, 0, p_cu, scu_x, scu_y);
            cbp += cbp_bit;
            p_cu->i_cbp = (int8_t)cbp;
            cbp_bit = aec_read_ctp_y(h, p_aec, 1, p_cu, scu_x, scu_y);
            cbp += (cbp_bit << 1);
            p_cu->i_cbp = (int8_t)cbp;
            cbp_bit = aec_read_ctp_y(h, p_aec, 2, p_cu, scu_x, scu_y);
            cbp += (cbp_bit << 2);
            p_cu->i_cbp = (int8_t)cbp;
            cbp_bit = aec_read_ctp_y(h, p_aec, 3, p_cu, scu_x, scu_y);
            cbp += (cbp_bit << 3);
            p_cu->i_cbp = (int8_t)cbp;
        }

        // chroma decoding
        if (h->i_chroma_format != CHROMA_400) {
            cbp_bit = biari_decode_symbol(p_aec, p_aec->syn_ctx.cbp_contexts + 6);
            if (cbp_bit) {
                cbp_bit = biari_decode_symbol(p_aec, p_aec->syn_ctx.cbp_contexts + 7);
                if (cbp_bit) {
                    cbp += 48;
                } else {
                    cbp_bit = biari_decode_symbol(p_aec, p_aec->syn_ctx.cbp_contexts + 7);
                    cbp += 16 << cbp_bit;
                }
            }
        }   // chroma CBP
    }

    if (!cbp) {
        h->i_last_dquant = 0;
    }

#if AVS2_TRACE
    avs2_trace("@%d %s\t\t\t\t%d\n", symbolCount++, p_aec->tracestring, cbp);
#endif
    return cbp;
}
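/* Sketch of the cbp bit layout assembled by aec_read_cbp() above (read off
 * the code, for illustration):
 *   bits 0..3 : ctp_y[0..3], one per luma transform block
 *   bit  4    : chroma U block coded   (cbp += 16)
 *   bit  5    : chroma V block coded   (cbp += 32; 48 means both)
 * so e.g. cbp == 17 means luma block 0 and the U block carry coefficients. */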
/* --------------------------------------------------------------------------- */
int cu_read_cbp(davs2_t *h, aec_t *p_aec, cu_t *p_cu, int scu_x, int scu_y)
{
#if AVS2_TRACE
    snprintf(p_aec->tracestring, TRACESTRING_SIZE, "CBP");
#endif
    p_cu->i_cbp = (int8_t)aec_read_cbp(p_aec, h, p_cu, scu_x, scu_y);   // check: first_mb_nr

    // delta quant only if nonzero coeffs
    if (h->b_DQP) {
        int i_delta_qp = 0;
        if (p_cu->i_cbp) {
            const int max_delta_qp = 32 + 4 * (h->sample_bit_depth - 8);
            const int min_delta_qp = -max_delta_qp;
#if AVS2_TRACE
            snprintf(p_aec->tracestring, TRACESTRING_SIZE, "delta quant");
#endif
            i_delta_qp = (int8_t)aec_read_cu_delta_qp(p_aec, h->i_last_dquant);
            if (i_delta_qp < min_delta_qp || i_delta_qp > max_delta_qp) {
                i_delta_qp = DAVS2_CLIP3(min_delta_qp, max_delta_qp, i_delta_qp);
                davs2_log(h, DAVS2_LOG_ERROR, "Invalid cu_qp_delta: %d.", i_delta_qp);
            }
        }

        h->i_last_dquant = i_delta_qp;
        p_cu->i_qp = (int8_t)i_delta_qp + h->lcu.i_left_cu_qp;
    } else {
        p_cu->i_qp = (int8_t)h->i_qp;
    }

    AEC_RETURN_ON_ERROR(-1);
    return 0;
}

/* ---------------------------------------------------------------------------
 * arithmetically decode the chroma intra prediction mode of a given CU
 */
int aec_read_intra_pmode_c(aec_t *p_aec, davs2_t *h, int luma_mode)
{
    context_t *p_ctx = p_aec->syn_ctx.intra_chroma_pred_mode;
    int act_ctx = h->lcu.c_ipred_mode_ctx;
    int lmode = tab_intra_mode_luma2chroma[luma_mode];
    int is_redundant = lmode >= 0;
    int act_sym;

    act_sym = !biari_decode_symbol(p_aec, p_ctx + act_ctx);
    if (act_sym != 0) {
        act_sym = unary_bin_max_decode(p_aec, p_ctx + 2, 0, 3) + 1;
        if (is_redundant && act_sym >= lmode) {
            if (act_sym == 4) {
                davs2_log(h, DAVS2_LOG_ERROR, "Error in intra_chroma_pred_mode. (%d, %d) (%d, %d)",
                          h->lcu.i_pix_x, h->lcu.i_pix_y, h->lcu.i_scu_x, h->lcu.i_scu_y);
                return 4;
            }
            act_sym++;
        }
    }

#if AVS2_TRACE
    avs2_trace("@%d %s\t\t%d\n", symbolCount++, p_aec->tracestring, act_sym);
#endif
    return act_sym;
}

/* --------------------------------------------------------------------------- */
static INLINE int aec_read_last_cg_pos(aec_t *p_aec, context_t *p_ctx, cu_t *p_cu,
                                       int *CGx, int *CGy, int b_luma, int num_cg,
                                       int is_dc_diag, int num_cg_x_minus1, int num_cg_y_minus1)
{
    int last_cg_x = 0;
    int last_cg_y = 0;
    int last_cg_idx = 0;

    if (b_luma && is_dc_diag) {
        DAVS2_SWAP(num_cg_x_minus1, num_cg_y_minus1);
    }

    if (num_cg == 4) {          // 8x8
        last_cg_idx = 0;
        last_cg_idx += biari_decode_symbol_continu0_ext(p_aec, p_ctx, 2, 3);

        if (b_luma && p_cu->i_trans_size == TU_SPLIT_HOR) {
            last_cg_x = last_cg_idx;
            last_cg_y = 0;
        } else if (b_luma && p_cu->i_trans_size == TU_SPLIT_VER) {
            last_cg_x = 0;
            last_cg_y = last_cg_idx;
        } else {
            last_cg_x = last_cg_idx & 1;
            last_cg_y = last_cg_idx >> 1;
        }
    } else {                    // 16x16 and 32x32
        int last_cg_bit;
        p_ctx += 3;
        last_cg_bit = biari_decode_symbol(p_aec, p_ctx);

        if (last_cg_bit == 0) {
            last_cg_x = 0;
            last_cg_y = 0;
            last_cg_idx = 0;
        } else {
            p_ctx++;
            last_cg_x = biari_decode_symbol_continue0(p_aec, p_ctx, num_cg_x_minus1);
            p_ctx++;
            if (last_cg_x == 0) {
                if (num_cg_y_minus1 != 1) {
                    last_cg_y = biari_decode_symbol_continue0(p_aec, p_ctx, num_cg_y_minus1 - 1);
                }
                last_cg_y++;
            } else {
                last_cg_y = biari_decode_symbol_continue0(p_aec, p_ctx, num_cg_y_minus1);
            }
        }

        if (b_luma && is_dc_diag) {
            DAVS2_SWAP(last_cg_x, last_cg_y);
        }

        if (b_luma && p_cu->i_trans_size == TU_SPLIT_HOR) {
            last_cg_idx = raster2ZZ_2x8[last_cg_y * 8 + last_cg_x];
        } else if (b_luma && p_cu->i_trans_size == TU_SPLIT_VER) {
            last_cg_idx = raster2ZZ_8x2[last_cg_y * 2 + last_cg_x];
        } else if (num_cg == 16) {
            last_cg_idx = raster2ZZ_4x4[last_cg_y * 4 + last_cg_x];
        } else {
            last_cg_idx = raster2ZZ_8x8[last_cg_y * 8 + last_cg_x];
        }
    }

    *CGx = last_cg_x;
    *CGy = last_cg_y;
    return last_cg_idx;
}

/* --------------------------------------------------------------------------- */
static INLINE int aec_read_last_coeff_pos_in_cg(aec_t *p_aec, context_t *p_ctx, int rank,
                                                int cg_x, int cg_y, int b_luma,
                                                int b_one_cg, int is_dc_diag)
{
    int xx, yy;
    int offset;

    /* AVS2-P2: 8.3.3.2.14, derive ctxIdxInc for last_coeff_pos_x and last_coeff_pos_y */
    if (b_luma == 0) {                      // chroma component: 12 contexts
        offset = b_one_cg ? 0 : 4 + (rank == 0) * 4;
    } else if (b_one_cg) {                  // luma with Log2TransformSize == 2: 8 contexts
        offset = 40 + is_dc_diag * 4;
    } else if (cg_x != 0 && cg_y != 0) {    // cg_x and cg_y both nonzero: 8 contexts
        offset = 32 + (rank == 0) * 4;
    } else {                                // other positions, derived from rank, CG position and scan type
        offset = (4 * (rank == 0) + 2 * (cg_x == 0 && cg_y == 0) + is_dc_diag) * 4;
    }

    p_ctx += offset;
    xx = biari_decode_symbol_continu0_ext(p_aec, p_ctx, 1, 3);
    p_ctx += 2;
    yy = biari_decode_symbol_continu0_ext(p_aec, p_ctx, 1, 3);

    if (cg_x == 0 && cg_y > 0 && is_dc_diag) {
        DAVS2_SWAP(xx, yy);
    }

    if (rank != 0) {
        xx = 3 - xx;
        if (is_dc_diag) {
            yy = 3 - yy;
        }
    }

    return tab_scan_coeff_pos_in_cg[yy][xx];
}

/* --------------------------------------------------------------------------- */
static ALWAYS_INLINE int get_abssum_of_n_last_coeffs(runlevel_pair_t *p_runlevel, int end_pair_pos, int start_pair_pos)
{
    int absSum5 = 0;
    int n = 0;
    int k;

    for (k = end_pair_pos - 1; k >= start_pair_pos; k--) {
        n += p_runlevel[k].run;
        if (n >= 6) {
            break;
        }
        absSum5 += DAVS2_ABS(p_runlevel[k].level);
        n++;
    }

    return absSum5;
}

/* --------------------------------------------------------------------------- */
typedef int (*aec_read_run_f)(aec_t *p_aec, context_t *p_ctx, int pos, int b_only_one_cg, int b_1st_cg);

/* --------------------------------------------------------------------------- */
static int aec_read_run_luma1(aec_t *p_aec, context_t *p_ctx, int pos, int b_only_one_cg, int b_1st_cg)
{
    int ctxpos;
    int Run = 0;
    int offset = 0;

    b_only_one_cg = b_only_one_cg ? 0 : 4;
    for (ctxpos = 0; Run != pos; ctxpos++) {
        if (ctxpos < pos) {
            int moddiv;   // moddiv: 0, 1, 2
            moddiv = (tab_scan_4x4[pos - 1 - ctxpos][1] + 1) >> 1;
            offset = (b_1st_cg ? (pos == ctxpos + 1 ? 0 : (1 + moddiv)) : (4 + moddiv)) + b_only_one_cg;   // 0,...,10
        }
        assert(offset >= 0 && offset < NUM_MAP_CTX);
        if (biari_decode_symbol(p_aec, p_ctx + offset)) {
            break;
        }
        Run++;
    }

    return Run;
}

/* --------------------------------------------------------------------------- */
static int aec_read_run_luma2(aec_t *p_aec, context_t *p_ctx, int pos, int b_only_one_cg, int b_1st_cg)
{
    int ctxpos;
    int Run = 0;
    int offset = 0;

    b_only_one_cg = b_only_one_cg ? 0 : 4;
    for (ctxpos = 0; Run != pos; ctxpos++) {
        if (ctxpos < pos) {
            int moddiv;   // moddiv: 0, 1, 2
            moddiv = ((pos < ctxpos + 4) ? 0 : (pos < ctxpos + 11 ? 1 : 2));
            offset = (b_1st_cg ? (pos == ctxpos + 1 ? 0 : (1 + moddiv)) : (4 + moddiv)) + b_only_one_cg;   // 0,...,10
        }
        assert(offset >= 0 && offset < NUM_MAP_CTX);
        if (biari_decode_symbol(p_aec, p_ctx + offset)) {
            break;
        }
        Run++;
    }

    return Run;
}
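/* Compiled-out sketch (ours) of the 0th-order Exp-Golomb escape pattern used
 * by aec_read_mvd() above and by the level escape in aec_read_run_level()
 * below: count zero-bins to get the prefix length k, then read k suffix bits.
 * Decoded value = (2^k - 1) + suffix, i.e. "1" -> 0, "01x" -> 1..2,
 * "001xx" -> 3..6, and so on. */
#if 0
static int example_decode_eg0(aec_t *p_aec)
{
    int golomb_order = 0;
    int value = 0;
    int suffix = 0;

    while (biari_decode_symbol_eq_prob(p_aec) == 0) {   /* prefix */
        value += (1 << golomb_order);
        golomb_order++;
    }
    while (golomb_order--) {                            /* suffix */
        suffix |= (biari_decode_symbol_eq_prob(p_aec) << golomb_order);
    }
    return value + suffix;
}
#endif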
/* --------------------------------------------------------------------------- */
static int aec_read_run_chroma(aec_t *p_aec, context_t *p_ctx, int pos, int b_only_one_cg, int b_1st_cg)
{
    int ctxpos;
    int Run = 0;
    int offset = 0;

    b_only_one_cg = b_only_one_cg ? 0 : 3;
    for (ctxpos = 0; Run != pos; ctxpos++) {
        if (ctxpos < pos) {
            int moddiv = (pos >= 6 + ctxpos);
            offset = (b_1st_cg ? (pos == ctxpos + 1 ? 0 : (1 + moddiv)) : (3 + moddiv)) + b_only_one_cg;
        }
        assert(offset >= 0 && offset < NUM_MAP_CTX);
        if (biari_decode_symbol(p_aec, p_ctx + offset)) {
            break;
        }
        Run++;
    }

    return Run;
}

/* --------------------------------------------------------------------------- */
static int aec_read_run_level(aec_t *p_aec, cu_t *p_cu, int num_cg, int b_luma, int is_dc_diag,
                              runlevel_t *runlevel, int scale, int shift)
{
    static const int numOfCoeffInCG = 16;
    const int add = (1 << (shift - 1));
    //--- read coefficients for whole block ---
    const int16_t (*tab_cg_scan)[2]    = runlevel->cg_scan;
    context_t (*ctxa_run)[NUM_MAP_CTX] = runlevel->p_ctx_run;
    context_t *p_ctx_level             = runlevel->p_ctx_level;
    context_t *p_ctx_nonzero_cg_flag   = runlevel->p_ctx_sig_cg;
    context_t *p_ctx_last_cg_pos       = runlevel->p_ctx_last_cg;
    context_t *p_ctx_last_pos_in_cg    = runlevel->p_ctx_last_pos_in_cg;
    runlevel_pair_t *p_runlevel        = runlevel->run_level;
    int idx_cg;
    int cg_pos = 0;
    int CGx = 0;
    int CGy = 0;
    int b_only_one_cg = (num_cg == 1);
    int8_t dct_pattern = DCT_QUAD;
    int w_tr_half, w_tr_quad;   // CG position limitation
    int h_tr_half, h_tr_quad;   // CG position limitation
    int w_tr = runlevel->w_tr;
    int h_tr = runlevel->h_tr;
#if AVS2_TRACE
    int idx_runlevel = 0;
#endif
    int rank = 0;
    aec_read_run_f f_read_run = b_luma ? (!is_dc_diag ? aec_read_run_luma1 : aec_read_run_luma2) : aec_read_run_chroma;

    /* boundaries for the dct_pattern_e decision */
    if (w_tr == h_tr) {
        w_tr_half = w_tr >> 1;
        h_tr_half = h_tr >> 1;
        w_tr_quad = w_tr >> 2;
        h_tr_quad = h_tr >> 2;
    } else if (w_tr > h_tr) {
        w_tr_half = w_tr >> 1;
        h_tr_half = h_tr >> 0;
        w_tr_quad = w_tr >> 2;
        h_tr_quad = h_tr >> 0;
    } else {
        w_tr_half = w_tr >> 0;
        h_tr_half = h_tr >> 1;
        w_tr_quad = w_tr >> 0;
        h_tr_quad = h_tr >> 2;
    }

    /* convert the boundaries into CG (4x4) units */
    w_tr_half >>= 2;
    h_tr_half >>= 2;
    w_tr_quad >>= 2;
    h_tr_quad >>= 2;

    /* 1, read last CG position */
    if (num_cg > 1) {
        int num_cg_x_minus1 = tab_cg_scan[num_cg - 1][0];
        int num_cg_y_minus1 = tab_cg_scan[num_cg - 1][1];

        cg_pos = aec_read_last_cg_pos(p_aec, p_ctx_last_cg_pos, p_cu, &CGx, &CGy,
                                      b_luma, num_cg, is_dc_diag, num_cg_x_minus1, num_cg_y_minus1);
    }

    num_cg = cg_pos + 1;
    runlevel->num_nonzero_cg = num_cg;

    /* 2, read coefficients in each CG */
    for (idx_cg = 0; idx_cg < num_cg; idx_cg++) {
        int b_1st_cg = (cg_pos == 0);
        int nonzero_cg_flag = 1;

        /* 2.1, sig CG flag */
        if (rank > 0) {
            /* update CG position */
            int ctx_sig_cg = (b_luma && cg_pos != 0);
            CGx = tab_cg_scan[cg_pos][0];
            CGy = tab_cg_scan[cg_pos][1];
            nonzero_cg_flag = biari_decode_symbol(p_aec, p_ctx_nonzero_cg_flag + ctx_sig_cg);
        }

        /* 2.2, coefficients in CG */
        if (nonzero_cg_flag) {
            int num_pairs_in_cg = 0;
            int i;

            // last in CG
            int pos = aec_read_last_coeff_pos_in_cg(p_aec, p_ctx_last_pos_in_cg, rank,
                                                    CGx, CGy, b_luma, b_only_one_cg, is_dc_diag);

            for (i = -numOfCoeffInCG; i != 0; i++) {
                // level
                int Run = 0;
                int Level = 1;
                int absSum5;
                context_t *p_ctx;

                /* coeff_level_minus1_band[j] */
                if (biari_decode_final(p_aec)) {
                    int golomb_order = 0;
                    int binary_symbol = 0;

                    for (;;) {
                        int l = biari_decode_symbol_eq_prob(p_aec);
                        AEC_RETURN_ON_ERROR(-1);
                        if (l) {
                            break;
                        }
                        Level += (1 << golomb_order);
                        golomb_order++;
                    }

                    while (golomb_order--) {
                        // next binary part
                        int sig = biari_decode_symbol_eq_prob(p_aec);
                        binary_symbol |= (sig << golomb_order);
                    }

                    Level += binary_symbol;
                    Level += 32;
                } else {
                    int pairsInCGIdx = (num_pairs_in_cg + 1) >> 1;
                    pairsInCGIdx = DAVS2_MIN(2, pairsInCGIdx);
                    p_ctx = p_ctx_level;
                    p_ctx += 10 * (b_1st_cg && pos < 3) + DAVS2_MIN(rank, pairsInCGIdx + 2) + ((5 * pairsInCGIdx) >> 1);
                    Level += biari_decode_symbol_continue0(p_aec, p_ctx, 31);
                }
                AEC_RETURN_ON_ERROR(-1);

                absSum5 = get_abssum_of_n_last_coeffs(p_runlevel, num_pairs_in_cg, 0);
                absSum5 = (absSum5 + Level) >> 1;
                p_ctx = ctxa_run[DAVS2_MIN(absSum5, 2)];

                // run
                Run = 0;
                if (pos > 0) {
                    Run = f_read_run(p_aec, p_ctx, pos, b_only_one_cg, b_1st_cg);
                }
                AEC_RETURN_ON_ERROR(-1);

#if AVS2_TRACE
                if (b_luma) {
                    avs2_trace(" Luma8x8 sng");
                    avs2_trace("(%2d) level =%3d run =%2d\n", idx_runlevel, Level, Run);
                } else {
                    avs2_trace(" AC chroma 8X8 ");
                    avs2_trace("%2d: level =%3d run =%2d\n", idx_runlevel, Level, Run);
                }
                idx_runlevel++;
#endif
                p_runlevel[num_pairs_in_cg].level = (int16_t)Level;
                p_runlevel[num_pairs_in_cg].run   = (int16_t)Run;
                num_pairs_in_cg++;

                if (Level > T_Chr[rank]) {
                    rank = tab_rank[DAVS2_MIN(5, Level)];
                }
                if (Run == pos) {
                    break;
                }
                pos -= (Run + 1);
            }   // for (i = -numOfCoeffInCG; i != 0; i++)

            // sign of level
            for (i = 0; i < num_pairs_in_cg; i++) {
                if (biari_decode_symbol_eq_prob(p_aec)) {
                    p_runlevel[i].level = -p_runlevel[i].level;
                }
            }

            /* convert run-level to coefficients */
            {
                const int b_swap_xy = runlevel->b_swap_xy;
                const int i_coeff = runlevel->i_res;
                coeff_t *p_res = runlevel->p_res;
                int num_pairs = num_pairs_in_cg;
                int coef_ctr = -1;

                if (b_swap_xy) {
                    DAVS2_SWAP(CGx, CGy);
                }

                p_res += i_coeff * (CGy << 2) + (CGx << 2);

                // convert the run-level pairs into coefficients inside this CG (inverse scan)
                while (num_pairs > 0) {   /* leave if len=1 */
                    int x_in_cg, y_in_cg;
                    int level = p_runlevel[num_pairs - 1].level;
                    int run = p_runlevel[num_pairs - 1].run;
                    num_pairs--;

                    if (run < 0 || run >= 16) {
                        // davs2_log(h, DAVS2_LOG_ERROR, "wrong run level.");
                        return -1;
                    }

                    coef_ctr += run + 1;
                    x_in_cg = tab_scan_4x4[coef_ctr][ b_swap_xy];
                    y_in_cg = tab_scan_4x4[coef_ctr][!b_swap_xy];
                    level = (level * scale + add) >> shift;
                    p_res[y_in_cg * i_coeff + x_in_cg] = (coeff_t)DAVS2_CLIP3(-32768, 32767, level);
                }

                if (CGy >= h_tr_half || CGx >= w_tr_half) {
                    dct_pattern = DCT_DEAULT;
                } else if ((CGy >= h_tr_quad || CGx >= w_tr_quad) && dct_pattern != DCT_DEAULT) {
                    dct_pattern = DCT_HALF;
                }
            }
        }   // end of reading one CG
        cg_pos--;
    }   // end of reading all CGs

    return dct_pattern;
}

/* ---------------------------------------------------------------------------
 * get coefficients of one block
 */
int8_t cu_get_block_coeffs(aec_t *p_aec, runlevel_t *runlevel,
                           cu_t *p_cu, coeff_t *p_res, int w_tr, int h_tr,
                           int i_tu_level, int b_luma,
                           int intra_pred_class, int b_swap_xy,
                           int scale, int shift, int wq_size_id)
{
    int num_coeffs = w_tr * h_tr;
    int num_cg = num_coeffs >> 4;

    runlevel->p_res      = p_res;
    runlevel->i_res      = w_tr;
    runlevel->b_swap_xy  = b_swap_xy;
    runlevel->i_tu_level = i_tu_level;
    runlevel->w_tr       = w_tr;
    runlevel->h_tr       = h_tr;

    UNUSED_PARAMETER(wq_size_id);

    return (int8_t)aec_read_run_level(p_aec, p_cu, num_cg, b_luma,
                                      intra_pred_class == INTRA_PRED_DC_DIAG,
                                      runlevel, scale, shift);
}

/* ---------------------------------------------------------------------------
 * finding end of a slice in case this is not the end of a frame
 *
 * unsure whether the "correction" below actually solves an off-by-one
 * problem or whether it introduces one in some cases :-( Anyway,
 * with this change the bit stream format works with AEC again.
*/ int aec_startcode_follows(aec_t *p_aec, int eos_bit) { int bit = 0; if (eos_bit) { bit = biari_decode_final(p_aec); #if AVS2_TRACE avs2_trace("@%d %s\t\t%d\n", symbolCount++, "Decode Sliceterm", bit); #endif } /* the best way to be sure that the current slice is end, * is to check if a start code is followed */ return bit; } /* --------------------------------------------------------------------------- */ int aec_read_split_flag(aec_t *p_aec, int i_level) { context_t *p_ctx = p_aec->syn_ctx.cu_split_flag + (i_level - MIN_CU_SIZE_IN_BIT - 1); int split_flag = biari_decode_symbol(p_aec, p_ctx); #if AVS2_TRACE avs2_trace("SplitFlag = %3d\n", split_flag); #endif return split_flag; } /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE int read_sao_mergeflag(aec_t *p_aec, int act_ctx) { int act_sym = 0; if (act_ctx == 1) { act_sym = biari_decode_symbol(p_aec, &p_aec->syn_ctx.sao_mergeflag_context[0]); } else if (act_ctx == 2) { act_sym = biari_decode_symbol(p_aec, &p_aec->syn_ctx.sao_mergeflag_context[1]); if (act_sym != 1) { act_sym += (biari_decode_symbol(p_aec, &p_aec->syn_ctx.sao_mergeflag_context[2]) << 1); } } return act_sym; } /* --------------------------------------------------------------------------- */ int aec_read_sao_mergeflag(aec_t *p_aec, int mergeleft_avail, int mergeup_avail) { int merge_left = 0; int merge_top = 0; int merge_index = read_sao_mergeflag(p_aec, mergeleft_avail + mergeup_avail); assert(merge_index <= 2); if (mergeleft_avail) { merge_left = merge_index & 0x01; merge_index = merge_index >> 1; } if (mergeup_avail && !merge_left) { merge_top = merge_index & 0x01; } return (merge_left << 1) + merge_top; } /* --------------------------------------------------------------------------- */ int aec_read_sao_mode(aec_t *p_aec) { int t2 = !biari_decode_symbol(p_aec, p_aec->syn_ctx.sao_mode_context); int act_sym; if (t2) { int t1 = !biari_decode_symbol_eq_prob(p_aec); act_sym = t2 + (t1 << 1); } else { act_sym = 0; } return act_sym; } /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE int read_sao_offset(aec_t *p_aec, int offset_type) { int maxvalue = saoclip[offset_type][2]; int cnt = 0; int act_sym, sym; if (offset_type == SAO_CLASS_BO) { sym = !biari_decode_symbol(p_aec, &p_aec->syn_ctx.sao_offset_context[0]); } else { sym = !biari_decode_symbol_eq_prob(p_aec); } while (sym) { cnt++; if (cnt == maxvalue) { break; } sym = !biari_decode_symbol_eq_prob(p_aec); } if (offset_type == SAO_CLASS_EO_FULL_VALLEY) { act_sym = EO_OFFSET_INV__MAP[cnt]; } else if (offset_type == SAO_CLASS_EO_FULL_PEAK) { act_sym = -EO_OFFSET_INV__MAP[cnt]; } else if (offset_type == SAO_CLASS_EO_HALF_PEAK) { act_sym = -cnt; } else { act_sym = cnt; } if (offset_type == SAO_CLASS_BO && act_sym) { if (biari_decode_symbol_eq_prob(p_aec)) { // sign symbol act_sym = -act_sym; } } return act_sym; } /* --------------------------------------------------------------------------- */ void aec_read_sao_offsets(aec_t *p_aec, sao_param_t *p_sao_param, int *offset) { int i; assert(p_sao_param->modeIdc == SAO_MODE_NEW); for (i = 0; i < 4; i++) { int offset_type; if (p_sao_param->typeIdc == SAO_TYPE_BO) { offset_type = SAO_CLASS_BO; } else { offset_type = (i >= 2) ? 
(i + 1) : i; } offset[i] = read_sao_offset(p_aec, offset_type); } } /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE int read_sao_type(aec_t *p_aec, int act_ctx) { int act_sym = 0; int golomb_order = 1; int length; if (act_ctx == 0) { length = NUM_SAO_EO_TYPES_LOG2; } else if (act_ctx == 1) { length = NUM_SAO_BO_CLASSES_LOG2; } else { assert(act_ctx == 2); length = NUM_SAO_BO_CLASSES_LOG2 - 1; } if (act_ctx == 2) { int temp; int rest; do { temp = biari_decode_symbol_eq_prob(p_aec); AEC_RETURN_ON_ERROR(-1); if (temp == 0) { act_sym += (1 << golomb_order); golomb_order++; } if (golomb_order == 4) { golomb_order = 0; temp = 1; } } while (temp != 1); rest = 0; while (golomb_order--) { // next binary part temp = biari_decode_symbol_eq_prob(p_aec); if (temp == 1) { rest |= (temp << golomb_order); } } act_sym += rest; } else { int i; for (i = 0; i < length; i++) { act_sym = act_sym + (biari_decode_symbol_eq_prob(p_aec) << i); } } return act_sym; } /* --------------------------------------------------------------------------- */ int aec_read_sao_type(aec_t *p_aec, sao_param_t *p_sao_param) { int stBnd[2]; assert(p_sao_param->modeIdc == SAO_MODE_NEW); if (p_sao_param->typeIdc == SAO_TYPE_BO) { stBnd[0] = read_sao_type(p_aec, 1); // read delta start band for BO stBnd[1] = read_sao_type(p_aec, 2) + 2; return (stBnd[0] + (stBnd[1] << NUM_SAO_BO_CLASSES_LOG2)); } else { assert(p_sao_param->typeIdc == SAO_TYPE_EO_0); return read_sao_type(p_aec, 0); } } /* --------------------------------------------------------------------------- */ int aec_read_alf_lcu_ctrl(aec_t *p_aec) { context_t *ctx = p_aec->syn_ctx.alf_lcu_enable_scmodel; return biari_decode_symbol(p_aec, ctx); } davs2-1.6/source/common/aec.h000066400000000000000000000130451337322544400161020ustar00rootroot00000000000000/* * aec.h * * Description of this file: * AEC functions definition of the davs2 library * * -------------------------------------------------------------------------- * * davs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. 
*/ #ifndef DAVS2_AEC_H #define DAVS2_AEC_H #ifdef __cplusplus extern "C" { #endif /* --------------------------------------------------------------------------- * global variables */ #define saoclip FPFX(saoclip) extern const int saoclip[NUM_SAO_OFFSET][3]; #define tab_intra_mode_scan_type FPFX(tab_intra_mode_scan_type) extern const int tab_intra_mode_scan_type[NUM_INTRA_MODE]; /* --------------------------------------------------------------------------- * aec basic operations */ #define aec_init_contexts FPFX(aec_init_contexts) void aec_init_contexts (aec_t *p_aec); #define aec_new_slice FPFX(aec_new_slice) void aec_new_slice (davs2_t *h); #define aec_start_decoding FPFX(aec_start_decoding) int aec_start_decoding (aec_t *p_aec, uint8_t *p_start, int i_byte_pos, int i_bytes); #define aec_bits_read FPFX(aec_bits_read) int aec_bits_read (aec_t *p_aec); #define aec_startcode_follows FPFX(aec_startcode_follows) int aec_startcode_follows (aec_t *p_aec, int eos_bit); /* --------------------------------------------------------------------------- * ctu structure information */ #define aec_read_split_flag FPFX(aec_read_split_flag) int aec_read_split_flag (aec_t *p_aec, int i_level); /* --------------------------------------------------------------------------- * cu type information */ #define aec_read_cu_type FPFX(aec_read_cu_type) int aec_read_cu_type (aec_t *p_aec, cu_t *p_cu, int img_type, int b_amp, int b_mhp, int b_wsm, int num_references); #define aec_read_cu_type_sframe FPFX(aec_read_cu_type_sframe) int aec_read_cu_type_sframe(aec_t *p_aec); #define aec_read_intra_cu_type FPFX(aec_read_intra_cu_type) int aec_read_intra_cu_type (aec_t *p_aec, cu_t *p_cu, int b_sdip, davs2_t *h); /* --------------------------------------------------------------------------- * inter prediction information */ #define aec_read_dmh_mode FPFX(aec_read_dmh_mode) int aec_read_dmh_mode (aec_t *p_aec, int i_cu_level); #define aec_read_mvds FPFX(aec_read_mvds) void aec_read_mvds (aec_t *p_aec, mv_t *p_mvd); #define aec_read_inter_pred_dir FPFX(aec_read_inter_pred_dir) void aec_read_inter_pred_dir(aec_t * p_aec, cu_t *p_cu, davs2_t *h); /* --------------------------------------------------------------------------- * intra prediction information */ #define aec_read_intra_pmode FPFX(aec_read_intra_pmode) int aec_read_intra_pmode (aec_t *p_aec); #define aec_read_intra_pmode_c FPFX(aec_read_intra_pmode_c) int aec_read_intra_pmode_c (aec_t *p_aec, davs2_t *h, int luma_mode); /* --------------------------------------------------------------------------- * transform unit (residual) information */ #define cu_read_cbp FPFX(cu_read_cbp) int cu_read_cbp (davs2_t *h, aec_t *p_aec, cu_t *p_cu, int scu_x, int scu_y); #define cu_get_block_coeffs FPFX(cu_get_block_coeffs) int8_t cu_get_block_coeffs (aec_t *p_aec, runlevel_t *runlevel, cu_t *p_cu, coeff_t *p_res, int w_tr, int h_tr, int i_tu_level, int b_luma, int intra_pred_class, int b_swap_xy, int scale, int shift, int wq_size_id); /* --------------------------------------------------------------------------- * loop filter information */ #define aec_read_sao_mergeflag FPFX(aec_read_sao_mergeflag) int aec_read_sao_mergeflag (aec_t *p_aec, int mergeleft_avail, int mergeup_avail); #define aec_read_sao_mode FPFX(aec_read_sao_mode) int aec_read_sao_mode (aec_t *p_aec); #define aec_read_sao_offsets FPFX(aec_read_sao_offsets) void aec_read_sao_offsets (aec_t *p_aec, sao_param_t *p_sao_param, int *offset); #define aec_read_sao_type FPFX(aec_read_sao_type) int aec_read_sao_type (aec_t *p_aec, 
sao_param_t *p_sao_param); #define aec_read_alf_lcu_ctrl FPFX(aec_read_alf_lcu_ctrl) int aec_read_alf_lcu_ctrl (aec_t *p_aec); #ifndef AEC_RETURN_ON_ERROR #define AEC_RETURN_ON_ERROR(ret_code) \ if (p_aec->b_bit_error) {\ p_aec->b_bit_error = FALSE; /* reset error flag */\ /* davs2_log(h, DAVS2_LOG_ERROR, "aec decoding error."); */\ return (ret_code);\ } #endif #ifdef __cplusplus } #endif #endif // DAVS2_AEC_H davs2-1.6/source/common/alf.cc000066400000000000000000000474121337322544400162570ustar00rootroot00000000000000/* * alf.cc * * Description of this file: * ALF functions definition of the davs2 library * * -------------------------------------------------------------------------- * * davs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ #include "common.h" #include "alf.h" #include "aec.h" #include "vlc.h" #include "frame.h" #if HAVE_MMX #include "vec/intrinsic.h" #endif /** * =========================================================================== * local function defines * =========================================================================== */ /* --------------------------------------------------------------------------- */ static void alf_recon_coefficients(alf_param_t *p_alf_param, int p_filter_coeff[][ALF_MAX_NUM_COEF]) { int num_coeff = p_alf_param->num_coeff - 1; int alf_num = 1 << ALF_NUM_BIT_SHIFT; int sum; int i, j; for (j = 0; j < p_alf_param->filters_per_group; j++) { sum = 0; for (i = 0; i < num_coeff; i++) { sum += (2 * p_alf_param->coeffmulti[j][i]); p_filter_coeff[j][i] = p_alf_param->coeffmulti[j][i]; } p_filter_coeff[j][num_coeff] = (alf_num - sum) + p_alf_param->coeffmulti[j][num_coeff]; } } /* --------------------------------------------------------------------------- */ static void alf_init_var_table(alf_param_t *p_alf_param, int *p_var_tab) { if (p_alf_param->filters_per_group > 1) { int i; p_var_tab[0] = 0; for (i = 1; i < ALF_NUM_VARS; ++i) { p_var_tab[i] = (p_alf_param->filterPattern[i]) ? (p_var_tab[i - 1] + 1) : p_var_tab[i - 1]; } } else { memset(p_var_tab, 0, ALF_NUM_VARS * sizeof(int)); } } /* --------------------------------------------------------------------------- */ static void alf_filter_block1(pel_t *p_dst, const pel_t *p_src, int stride, int lcu_pix_x, int lcu_pix_y, int lcu_width, int lcu_height, int *alf_coeff, int b_top_avail, int b_down_avail) { const int pel_add = 1 << (ALF_NUM_BIT_SHIFT - 1); const int pel_max = max_pel_value; const int min_x = -3; const int max_x = lcu_width - 1 + 3; int x, y; const pel_t *imgPad1, *imgPad2, *imgPad3, *imgPad4, *imgPad5, *imgPad6; { int startPos = b_top_avail ? 
(lcu_pix_y - 4) : lcu_pix_y; int endPos = b_down_avail ? (lcu_pix_y + lcu_height - 4) : (lcu_pix_y + lcu_height); p_src += (startPos * stride) + lcu_pix_x; p_dst += (startPos * stride) + lcu_pix_x; lcu_height = endPos - startPos; lcu_height--; } for (y = 0; y <= lcu_height; y++) { int yUp, yBottom; yUp = DAVS2_CLIP3(0, lcu_height, y - 1); yBottom = DAVS2_CLIP3(0, lcu_height, y + 1); imgPad1 = p_src + (yBottom - y) * stride; imgPad2 = p_src + (yUp - y) * stride; yUp = DAVS2_CLIP3(0, lcu_height, y - 2); yBottom = DAVS2_CLIP3(0, lcu_height, y + 2); imgPad3 = p_src + (yBottom - y) * stride; imgPad4 = p_src + (yUp - y) * stride; yUp = DAVS2_CLIP3(0, lcu_height, y - 3); yBottom = DAVS2_CLIP3(0, lcu_height, y + 3); imgPad5 = p_src + (yBottom - y) * stride; imgPad6 = p_src + (yUp - y) * stride; for (x = 0; x < lcu_width; x++) { int xLeft, xRight; int pel_val; pel_val = alf_coeff[0] * (imgPad5[x] + imgPad6[x]); pel_val += alf_coeff[1] * (imgPad3[x] + imgPad4[x]); xLeft = DAVS2_CLIP3(min_x, max_x, x - 1); xRight = DAVS2_CLIP3(min_x, max_x, x + 1); pel_val += alf_coeff[2] * (imgPad1[xRight] + imgPad2[xLeft ]); pel_val += alf_coeff[3] * (imgPad1[x ] + imgPad2[x ]); pel_val += alf_coeff[4] * (imgPad1[xLeft ] + imgPad2[xRight]); pel_val += alf_coeff[7] * (p_src [xRight] + p_src [xLeft ]); xLeft = DAVS2_CLIP3(min_x, max_x, x - 2); xRight = DAVS2_CLIP3(min_x, max_x, x + 2); pel_val += alf_coeff[6] * (p_src [xRight] + p_src [xLeft ]); xLeft = DAVS2_CLIP3(min_x, max_x, x - 3); xRight = DAVS2_CLIP3(min_x, max_x, x + 3); pel_val += alf_coeff[5] * (p_src [xRight] + p_src [xLeft ]); pel_val += alf_coeff[8] * (p_src [x ]); pel_val = (pel_val + pel_add) >> ALF_NUM_BIT_SHIFT; p_dst[x] = (pel_t)DAVS2_CLIP3(0, pel_max, pel_val); } p_src += stride; p_dst += stride; } } /* --------------------------------------------------------------------------- */ static void alf_filter_block2(pel_t *p_dst, const pel_t *p_src, int i_src, int lcu_pix_x, int lcu_pix_y, int lcu_width, int lcu_height, int *alf_coeff, int b_top_avail, int b_down_avail) { const pel_t *p_src1, *p_src2, *p_src3, *p_src4, *p_src5, *p_src6; int i_dst = i_src; int pixelInt; int startPos = b_top_avail ? (lcu_pix_y - 4) : lcu_pix_y; int endPos = b_down_avail ? 
(lcu_pix_y + lcu_height - 4) : (lcu_pix_y + lcu_height); /* first line */ p_src += (startPos * i_src) + lcu_pix_x; p_dst += (startPos * i_dst) + lcu_pix_x; if (p_src[0] != p_src[-1]) { p_src1 = p_src + 1 * i_src; p_src2 = p_src; p_src3 = p_src + 2 * i_src; p_src4 = p_src; p_src5 = p_src + 3 * i_src; p_src6 = p_src; pixelInt = alf_coeff[0] * (p_src5[ 0] + p_src6[ 0]); pixelInt += alf_coeff[1] * (p_src3[ 0] + p_src4[ 0]); pixelInt += alf_coeff[2] * (p_src1[ 1] + p_src2[ 0]); pixelInt += alf_coeff[3] * (p_src1[ 0] + p_src2[ 0]); pixelInt += alf_coeff[4] * (p_src1[-1] + p_src2[ 1]); pixelInt += alf_coeff[7] * (p_src [ 1] + p_src [-1]); pixelInt += alf_coeff[6] * (p_src [ 2] + p_src [-2]); pixelInt += alf_coeff[5] * (p_src [ 3] + p_src [-3]); pixelInt += alf_coeff[8] * (p_src [ 0]); pixelInt = (int)((pixelInt + 32) >> 6); p_dst[0] = (pel_t)DAVS2_CLIP1(pixelInt); } p_src += lcu_width - 1; p_dst += lcu_width - 1; if (p_src[0] != p_src[1]) { p_src1 = p_src + 1 * i_src; p_src2 = p_src; p_src3 = p_src + 2 * i_src; p_src4 = p_src; p_src5 = p_src + 3 * i_src; p_src6 = p_src; pixelInt = alf_coeff[0] * (p_src5[ 0] + p_src6[ 0]); pixelInt += alf_coeff[1] * (p_src3[ 0] + p_src4[ 0]); pixelInt += alf_coeff[2] * (p_src1[ 1] + p_src2[-1]); pixelInt += alf_coeff[3] * (p_src1[ 0] + p_src2[ 0]); pixelInt += alf_coeff[4] * (p_src1[-1] + p_src2[ 0]); pixelInt += alf_coeff[7] * (p_src [ 1] + p_src [-1]); pixelInt += alf_coeff[6] * (p_src [ 2] + p_src [-2]); pixelInt += alf_coeff[5] * (p_src [ 3] + p_src [-3]); pixelInt += alf_coeff[8] * (p_src [ 0]); pixelInt = (int)((pixelInt + 32) >> 6); p_dst[0] = (pel_t)DAVS2_CLIP1(pixelInt); } /* last line */ p_src -= lcu_width - 1; p_dst -= lcu_width - 1; p_src += ((endPos - startPos - 1) * i_src); p_dst += ((endPos - startPos - 1) * i_dst); if (p_src[0] != p_src[-1]) { p_src1 = p_src; p_src2 = p_src - 1 * i_src; p_src3 = p_src; p_src4 = p_src - 2 * i_src; p_src5 = p_src; p_src6 = p_src - 3 * i_src; pixelInt = alf_coeff[0] * (p_src5[ 0] + p_src6[ 0]); pixelInt += alf_coeff[1] * (p_src3[ 0] + p_src4[ 0]); pixelInt += alf_coeff[2] * (p_src1[ 1] + p_src2[-1]); pixelInt += alf_coeff[3] * (p_src1[ 0] + p_src2[ 0]); pixelInt += alf_coeff[4] * (p_src1[ 0] + p_src2[ 1]); pixelInt += alf_coeff[7] * (p_src [ 1] + p_src [-1]); pixelInt += alf_coeff[6] * (p_src [ 2] + p_src [-2]); pixelInt += alf_coeff[5] * (p_src [ 3] + p_src [-3]); pixelInt += alf_coeff[8] * (p_src [ 0]); pixelInt = (int)((pixelInt + 32) >> 6); p_dst[0] = (pel_t)DAVS2_CLIP1(pixelInt); } p_src += lcu_width - 1; p_dst += lcu_width - 1; if (p_src[0] != p_src[1]) { p_src1 = p_src; p_src2 = p_src - 1 * i_src; p_src3 = p_src; p_src4 = p_src - 2 * i_src; p_src5 = p_src; p_src6 = p_src - 3 * i_src; pixelInt = alf_coeff[0] * (p_src5[ 0] + p_src6[ 0]); pixelInt += alf_coeff[1] * (p_src3[ 0] + p_src4[ 0]); pixelInt += alf_coeff[2] * (p_src1[ 0] + p_src2[-1]); pixelInt += alf_coeff[3] * (p_src1[ 0] + p_src2[ 0]); pixelInt += alf_coeff[4] * (p_src1[-1] + p_src2[ 1]); pixelInt += alf_coeff[7] * (p_src [ 1] + p_src [-1]); pixelInt += alf_coeff[6] * (p_src [ 2] + p_src [-2]); pixelInt += alf_coeff[5] * (p_src [ 3] + p_src [-3]); pixelInt += alf_coeff[8] * (p_src [ 0]); pixelInt = (int)((pixelInt + 32) >> 6); p_dst[0] = (pel_t)DAVS2_CLIP1(pixelInt); } } /* --------------------------------------------------------------------------- */ static void deriveBoundaryAvail(davs2_t *h, int lcu_xy, int width_in_lcu, int height_in_lcu, int *b_top_avail, int *b_down_avail) { *b_top_avail = (lcu_xy >= width_in_lcu); *b_down_avail = (lcu_xy < 
(height_in_lcu - 1) * width_in_lcu); if (!h->seq_info.cross_loop_filter_flag) { int width_in_scu = h->i_width_in_scu; int lcu_pic_x = (lcu_xy % width_in_lcu) << h->i_lcu_level; int lcu_pic_y = (lcu_xy / width_in_lcu) << h->i_lcu_level; int scu_xy = (lcu_pic_y >> MIN_CU_SIZE_IN_BIT) * width_in_scu + (lcu_pic_x >> MIN_CU_SIZE_IN_BIT); // int scu_xy_next_row = scu_xy + (1 << (h->i_lcu_level - MIN_CU_SIZE_IN_BIT)) * width_in_scu; int slice_idx_top = *b_top_avail ? h->scu_data[scu_xy - width_in_scu].i_slice_nr : -1; // int slice_idx_down = *b_down_avail ? h->scu_data[scu_xy_next_row].i_slice_nr : -1; int slice_idx_cur = h->scu_data[scu_xy].i_slice_nr; *b_top_avail = (slice_idx_cur == slice_idx_top) ? TRUE : FALSE; // *b_down_avail = (slice_idx_cur == slice_idx_down) ? TRUE : FALSE; } } /* --------------------------------------------------------------------------- */ static void alf_param_init(alf_param_t *alf_par, int cID) { alf_par->num_coeff = ALF_MAX_NUM_COEF; alf_par->filters_per_group = 1; alf_par->componentID = cID; memset(alf_par->filterPattern, 0, sizeof(alf_par->filterPattern)); memset(alf_par->coeffmulti, 0, sizeof(alf_par->coeffmulti)); } /** * =========================================================================== * interface function defines * =========================================================================== */ /* --------------------------------------------------------------------------- */ size_t alf_get_buffer_size(davs2_t *h) { size_t width_in_lcu = h->i_width_in_lcu; size_t height_in_lcu = h->i_height_in_lcu; return sizeof(alf_var_t) + height_in_lcu * width_in_lcu * sizeof(uint8_t); } /* --------------------------------------------------------------------------- */ void alf_init_buffer(davs2_t *h) { static const uint8_t regionTable[ALF_NUM_VARS] = { 0, 1, 4, 5, 15, 2, 3, 6, 14, 11, 10, 7, 13, 12, 9, 8 }; int width_in_lcu = h->i_width_in_lcu; int height_in_lcu = h->i_height_in_lcu; int quad_w_in_lcu = ((width_in_lcu + 1) >> 2); int quad_h_in_lcu = ((height_in_lcu + 1) >> 2); int region_idx_x; int region_idx_y; int i, j; uint8_t *mem_ptr = (uint8_t *)h->p_alf; h->p_alf->tab_lcu_region = (uint8_t *)(mem_ptr + sizeof(alf_var_t)); memset(h->p_alf->filterCoeffSym, 0, sizeof(h->p_alf->filterCoeffSym)); for (j = 0; j < height_in_lcu; j++) { region_idx_y = (quad_h_in_lcu == 0) ? 3 : DAVS2_MIN(j / quad_h_in_lcu, 3); for (i = 0; i < width_in_lcu; i++) { region_idx_x = (quad_w_in_lcu == 0) ?
3 : DAVS2_MIN(i / quad_w_in_lcu, 3); h->p_alf->tab_lcu_region[j * width_in_lcu + i] = regionTable[region_idx_y * 4 + region_idx_x]; } } for (i = 0; i < IMG_COMPONENTS; i++) { alf_param_init(&h->p_alf->img_param[i], i); } } /* --------------------------------------------------------------------------- */ static void vlc_read_alf_coeff(davs2_bs_t *bs, alf_param_t *alf_param) { const int numCoeff = ALF_MAX_NUM_COEF; int f, symbol, pre_symbol; int pos; switch (alf_param->componentID) { case IMG_U: case IMG_V: for (pos = 0; pos < numCoeff; pos++) { alf_param->coeffmulti[0][pos] = se_v(bs, "Chroma ALF coefficients"); } break; case IMG_Y: alf_param->filters_per_group = ue_v(bs, "ALF filter number"); alf_param->filters_per_group = alf_param->filters_per_group + 1; memset(alf_param->filterPattern, 0, ALF_NUM_VARS * sizeof(int)); pre_symbol = 0; symbol = 0; for (f = 0; f < alf_param->filters_per_group; f++) { if (f > 0) { if (alf_param->filters_per_group != 16) { symbol = ue_v(bs, "Region distance"); } else { symbol = 1; } alf_param->filterPattern[symbol + pre_symbol] = 1; pre_symbol += symbol; } for (pos = 0; pos < numCoeff; pos++) { alf_param->coeffmulti[f][pos] = se_v(bs, "Luma ALF coefficients"); } } break; default: /// Not a legal component ID assert(0); exit(-1); } } /* --------------------------------------------------------------------------- */ void alf_read_param(davs2_t *h, davs2_bs_t *bs) { if (h->b_alf) { h->pic_alf_on[IMG_Y] = u_flag(bs, "alf_pic_flag_Y"); h->pic_alf_on[IMG_U] = u_flag(bs, "alf_pic_flag_Cb"); h->pic_alf_on[IMG_V] = u_flag(bs, "alf_pic_flag_Cr"); if (h->pic_alf_on[0] || h->pic_alf_on[1] || h->pic_alf_on[2]) { int component_idx; for (component_idx = 0; component_idx < IMG_COMPONENTS; component_idx++) { if (h->pic_alf_on[component_idx]) { vlc_read_alf_coeff(bs, &h->p_alf->img_param[component_idx]); } } } } } /* --------------------------------------------------------------------------- * ALF one LCU block */ static void alf_lcu_block(davs2_t *h, alf_param_t *p_alf_param, davs2_frame_t *p_tmp_frm, davs2_frame_t *p_dec_frm, int i_lcu_x, int i_lcu_y) { int lcu_size = h->i_lcu_size; int img_height = h->i_height; int img_width = h->i_width; int width_in_lcu = h->i_width_in_lcu; int height_in_lcu = h->i_height_in_lcu; int lcu_pix_x = i_lcu_x << h->i_lcu_level; int lcu_pix_y = i_lcu_y << h->i_lcu_level; int lcu_width = (lcu_pix_x + lcu_size > img_width ) ? (img_width - lcu_pix_x) : lcu_size; int lcu_height = (lcu_pix_y + lcu_size > img_height) ?
(img_height - lcu_pix_y) : lcu_size; int lcu_xy = i_lcu_y * width_in_lcu + i_lcu_x; int b_top_avail, b_down_avail; int lcu_region_idx = h->p_alf->tab_lcu_region[lcu_xy]; int *alf_coef; // derive CTU boundary availabilities deriveBoundaryAvail(h, lcu_xy, width_in_lcu, height_in_lcu, &b_top_avail, &b_down_avail); if (h->lcu_infos[lcu_xy].enable_alf[0]) { alf_init_var_table(&p_alf_param[0], h->p_alf->tab_region_coeff_idx); // reconstruct ALF coefficients & related parameters alf_recon_coefficients(&p_alf_param[0], h->p_alf->filterCoeffSym); alf_coef = h->p_alf->filterCoeffSym[h->p_alf->tab_region_coeff_idx[lcu_region_idx]]; gf_davs2.alf_block[0](p_dec_frm->planes[0], p_tmp_frm->planes[0], p_dec_frm->i_stride[0], lcu_pix_x, lcu_pix_y, lcu_width, lcu_height, alf_coef, b_top_avail, b_down_avail); gf_davs2.alf_block[1](p_dec_frm->planes[0], p_tmp_frm->planes[0], p_dec_frm->i_stride[0], lcu_pix_x, lcu_pix_y, lcu_width, lcu_height, alf_coef, b_top_avail, b_down_avail); } lcu_pix_x >>= 1; lcu_pix_y >>= 1; lcu_width >>= 1; lcu_height >>= 1; if (h->lcu_infos[lcu_xy].enable_alf[1]) { // reconstruct ALF coefficients & related parameters alf_recon_coefficients(&p_alf_param[1], h->p_alf->filterCoeffSym); alf_coef = h->p_alf->filterCoeffSym[0]; gf_davs2.alf_block[0](p_dec_frm->planes[1], p_tmp_frm->planes[1], p_dec_frm->i_stride[1], lcu_pix_x, lcu_pix_y, lcu_width, lcu_height, alf_coef, b_top_avail, b_down_avail); gf_davs2.alf_block[1](p_dec_frm->planes[1], p_tmp_frm->planes[1], p_dec_frm->i_stride[1], lcu_pix_x, lcu_pix_y, lcu_width, lcu_height, alf_coef, b_top_avail, b_down_avail); } if (h->lcu_infos[lcu_xy].enable_alf[2]) { // reconstruct ALF coefficients & related parameters alf_recon_coefficients(&p_alf_param[2], h->p_alf->filterCoeffSym); alf_coef = h->p_alf->filterCoeffSym[0]; gf_davs2.alf_block[0](p_dec_frm->planes[2], p_tmp_frm->planes[2], p_dec_frm->i_stride[2], lcu_pix_x, lcu_pix_y, lcu_width, lcu_height, alf_coef, b_top_avail, b_down_avail); gf_davs2.alf_block[1](p_dec_frm->planes[2], p_tmp_frm->planes[2], p_dec_frm->i_stride[2], lcu_pix_x, lcu_pix_y, lcu_width, lcu_height, alf_coef, b_top_avail, b_down_avail); } } /* --------------------------------------------------------------------------- */ void alf_lcurow(davs2_t *h, alf_param_t *p_alf_param, davs2_frame_t *p_tmp_frm, davs2_frame_t *p_dec_frm, int i_lcu_y) { const int w_in_lcu = h->i_width_in_lcu; int i_lcu_x; /* copy one decoded LCU-row (with padding left and right edges) */ davs2_frame_copy_lcurow(h, p_tmp_frm, p_dec_frm, i_lcu_y, -4, 8); /* ALF one LCU-row */ for (i_lcu_x = 0; i_lcu_x < w_in_lcu; i_lcu_x++) { alf_lcu_block(h, p_alf_param, p_tmp_frm, p_dec_frm, i_lcu_x, i_lcu_y); } } /* --------------------------------------------------------------------------- */ void davs2_alf_init(uint32_t cpuid, ao_funcs_t *fh) { UNUSED_PARAMETER(cpuid); /* init c function handles */ fh->alf_block[0] = alf_filter_block1; fh->alf_block[1] = alf_filter_block2; /* init asm function handles */ #if HAVE_MMX #if HIGH_BIT_DEPTH #else if (cpuid & DAVS2_CPU_SSE4) { fh->alf_block[0] = alf_filter_block_sse128; } #endif #endif } davs2-1.6/source/common/alf.h000066400000000000000000000036761337322544400161210ustar00rootroot00000000000000/* * alf.h * * Description of this file: * ALF functions definition of the davs2 library * * -------------------------------------------------------------------------- * * davs2 - video decoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc.
* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ #ifndef DAVS2_ALF_H #define DAVS2_ALF_H #ifdef __cplusplus extern "C" { #endif #define alf_get_buffer_size FPFX(alf_get_buffer_size) size_t alf_get_buffer_size(davs2_t *h); #define alf_init_buffer FPFX(alf_init_buffer) void alf_init_buffer (davs2_t *h); #define alf_lcurow FPFX(alf_lcurow) void alf_lcurow(davs2_t *h, alf_param_t *p_alf_param, davs2_frame_t *p_tmp_frm, davs2_frame_t *p_dec_frm, int i_lcu_y); #define alf_read_param FPFX(alf_read_param) void alf_read_param(davs2_t *h, davs2_bs_t *bs); #define davs2_alf_init FPFX(alf_init) void davs2_alf_init(uint32_t cpuid, ao_funcs_t *fh); #ifdef __cplusplus } #endif #endif // DAVS2_ALF_H davs2-1.6/source/common/bitstream.cc000066400000000000000000000214231337322544400175010ustar00rootroot00000000000000/* * bitstream.cc * * Description of this file: * Bitstream functions definition of the davs2 library * * -------------------------------------------------------------------------- * * davs2 - video decoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn.
*/ #include "common.h" #include "aec.h" #include "bitstream.h" /* --------------------------------------------------------------------------- * start code (in 32-bit) */ #define SEQUENCE_START_CODE 0xB0010000 #define I_FRAME_START_CODE 0xB3010000 #define PB_FRAME_START_CODE 0xB6010000 /** * =========================================================================== * function defines * =========================================================================== */ /* --------------------------------------------------------------------------- */ void bs_init(davs2_bs_t *bs, uint8_t *p_data, int i_data) { bs->p_stream = p_data; bs->i_stream = i_data; bs->i_bit_pos = 0; } /* --------------------------------------------------------------------------- * align position in bitstream */ void bs_align(davs2_bs_t *bs) { bs->i_bit_pos = ((bs->i_bit_pos + 7) >> 3) << 3; } /* --------------------------------------------------------------------------- */ int bs_left_bytes(davs2_bs_t *bs) { return (bs->i_stream - (bs->i_bit_pos >> 3)); } /* --------------------------------------------------------------------------- * Function : try to find slice header in next forward bytes * Parameters : * [in ] : bs - pointer to the bit-stream data buffer * Return : TRUE for slice header, otherwise FALSE * --------------------------------------------------------------------------- */ int found_slice_header(davs2_bs_t *bs) { int num_bytes = 4; for (; num_bytes; num_bytes--) { uint8_t *data = bs->p_stream + ((bs->i_bit_pos + 7) >> 3); uint32_t code = *(uint32_t *)data; if ((code & 0x00FFFFFF) == 0x00010000 && ((code >> 24) <= SC_SLICE_CODE_MAX)) { return 1; } bs->i_bit_pos += 8; } return 0; } /* --------------------------------------------------------------------------- */ int bs_get_start_code(davs2_bs_t *bs) { uint8_t *p_data = bs->p_stream + ((bs->i_bit_pos + 7) >> 3); int i_left_bytes = bs_left_bytes(bs); int i_used_bytes = 0; /* find the start code '00 00 01 xx' */ while (i_left_bytes >= 4 && (*(uint32_t *)p_data & 0x00FFFFFF) != 0x00010000) { p_data++; i_left_bytes--; i_used_bytes++; } if (i_left_bytes >= 4) { bs->i_bit_pos += (i_used_bytes << 3); return p_data[3]; } else { return -1; } } /* --------------------------------------------------------------------------- * Function : check bitstream & dispose the pseudo start code * Parameters : * [out] : dst - pointer to dst byte buffer * [in/out] : src - pointer to source byte buffer * [in/out] : i_src - byte number of src * Return : byte number of dst * --------------------------------------------------------------------------- */ int bs_dispose_pseudo_code(uint8_t *dst, uint8_t *src, int i_src) { static const int BITMASK[] = { 0x00, 0x00, 0xc0, 0x00, 0xf0, 0x00, 0xfc, 0x00 }; int b_found_start_code = 0; int leading_zeros = 0; int last_bit_count = 0; int curr_bit_count = 0; int b_dispose = 0; int i_pos = 0; int i_dst = 0; uint8_t last_byte = 0; uint8_t curr_byte = 0; /* checking...
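* A note on what the loop below does (derived from the code itself): inside disposable payloads (picture/slice data, but not sequence header, user data or extension payloads), a 0x02 byte that follows two consecutive zero bytes carries two stuffing bits for start-code emulation prevention, so only its 6 most-significant bits belong to the payload; the loop detects start codes, decides whether disposal applies, drops those stuffing bits and re-packs the surviving bits byte by byte into dst.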
*/ while (i_pos < i_src) { curr_byte = src[i_pos++]; curr_bit_count = 8; switch (curr_byte) { case 0: if (b_found_start_code) { b_dispose = 1; /* start code of first slice: [00 00 01 00] */ b_found_start_code = 0; } leading_zeros++; break; case 1: if (leading_zeros >= 2) { /* find start code: [00 00 01] */ b_found_start_code = 1; if (last_bit_count) { /* terminate the fixing work before new start code */ last_bit_count = 0; dst[i_dst++] = 0; /* insert the dispose byte */ } } leading_zeros = 0; break; case 2: if (b_dispose && leading_zeros == 2) { /* dispose the pseudo code, two bits */ curr_bit_count = 6; } leading_zeros = 0; break; default: if (b_found_start_code) { if (curr_byte == SC_SEQUENCE_HEADER || curr_byte == SC_USER_DATA || curr_byte == SC_EXTENSION) { b_dispose = 0; } else { b_dispose = 1; } b_found_start_code = 0; } leading_zeros = 0; break; } if (curr_bit_count == 8) { if (last_bit_count == 0) { dst[i_dst++] = curr_byte; } else { dst[i_dst++] = ((last_byte & BITMASK[last_bit_count]) | ((curr_byte & BITMASK[8 - last_bit_count]) >> last_bit_count)); last_byte = (curr_byte << (8 - last_bit_count)) & BITMASK[last_bit_count]; } } else { if (last_bit_count == 0) { last_byte = curr_byte; last_bit_count = curr_bit_count; } else { dst[i_dst++] = ((last_byte & BITMASK[last_bit_count]) | ((curr_byte & BITMASK[8 - last_bit_count]) >> last_bit_count)); last_byte = (curr_byte << (8 - last_bit_count)) & BITMASK[last_bit_count - 2]; last_bit_count = last_bit_count - 2; } } } if (last_bit_count != 0 && last_byte != 0) { dst[i_dst++] = last_byte; } return i_dst; } // --------------------------------------------------------------------------- // find the first start code in byte stream // return the byte address if found, or NULL on failure const uint8_t * find_start_code(const uint8_t *data, int len) { while (len >= 4 && (*(uint32_t *)data & 0x00FFFFFF) != 0x00010000) { data++; len--; } return len >= 4 ? data : NULL; } // --------------------------------------------------------------------------- // find the first picture or sequence start code in byte stream int32_t find_pic_start_code(uint8_t prevbyte3, uint8_t prevbyte2, uint8_t prevbyte1, const uint8_t *data, int32_t len) { #define ISPIC(x) ((x) == 0xB0 || (x) == 0xB1 || (x) == 0xB3 || (x) == 0xB6 || (x) == 0xB7) const uint8_t *p = NULL; const uint8_t *data0 = data; const int32_t len0 = len; /* check start code: 00 00 01 xx */ if (/*..*/ len >= 1 && (prevbyte3 == 0) && (prevbyte2 == 0) && (prevbyte1 == 1)) { if (ISPIC(data[0])) { return -3; // found start code (position: -3) } } else if (len >= 2 && (prevbyte2 == 0) && (prevbyte1 == 0) && (data[0] == 1)) { if (ISPIC(data[1])) { return -2; // found start code (position: -2) } } else if (len >= 3 && (prevbyte1 == 0) && (data[0] == 0) && (data[1] == 1)) { if (ISPIC(data[2])) { return -1; // found start code (position: -1) } } /* check start code: 00 00 01 xx, ONLY in data buffer */ while (((p = (uint8_t *)find_start_code(data, len)) != NULL) && !ISPIC(p[3])) { len -= (int32_t)(p - data + 4); data = p + 4; } return (int32_t)(p != NULL ? 
p - data0 : len0 + 1); #undef ISPIC } davs2-1.6/source/common/bitstream.h000066400000000000000000000044451337322544400173500ustar00rootroot00000000000000/* * bitstream.h * * Description of this file: * Bitstream functions definition of the davs2 library * * -------------------------------------------------------------------------- * * davs2 - video decoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ #ifndef DAVS2_BITSTREAM_H #define DAVS2_BITSTREAM_H #ifdef __cplusplus extern "C" { #endif #include "common.h" #define bs_init FPFX(bs_init) void bs_init(davs2_bs_t *bs, uint8_t *p_data, int i_data); #define bs_align FPFX(bs_align) void bs_align(davs2_bs_t *bs); #define bs_left_bytes FPFX(bs_left_bytes) int bs_left_bytes(davs2_bs_t *bs); #define found_slice_header FPFX(found_slice_header) int found_slice_header(davs2_bs_t *bs); #define bs_get_start_code FPFX(bs_get_start_code) int bs_get_start_code(davs2_bs_t *bs); #define bs_dispose_pseudo_code FPFX(bs_dispose_pseudo_code) int bs_dispose_pseudo_code(uint8_t *dst, uint8_t *src, int i_src); #define find_start_code FPFX(find_start_code) const uint8_t * find_start_code(const uint8_t *data, int len); #define find_pic_start_code FPFX(find_pic_start_code) int32_t find_pic_start_code(uint8_t prevbyte3, uint8_t prevbyte2, uint8_t prevbyte1, const uint8_t *data, int32_t len); #ifdef __cplusplus } #endif #endif // DAVS2_BITSTREAM_H davs2-1.6/source/common/block_info.cc000066400000000000000000000073171337322544400176210ustar00rootroot00000000000000/* * block_info.cc * * Description of this file: * Block-information functions definition of the davs2 library * * -------------------------------------------------------------------------- * * davs2 - video decoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license.
* For more information, contact us at sswang @ pku.edu.cn. */ #include "common.h" #include "block_info.h" /** * =========================================================================== * local & global variables (const tables) * =========================================================================== */ /** * =========================================================================== * function defines * =========================================================================== */ /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE cu_t *get_neighbor_cu_in_slice(davs2_t *h, cu_t *p_cur, int scu_x, int scu_y, int x4x4, int y4x4) { const int shift_4x4 = MIN_CU_SIZE_IN_BIT - MIN_PU_SIZE_IN_BIT; if (x4x4 < 0 || y4x4 < 0 || x4x4 >= h->i_width_in_spu || y4x4 >= h->i_height_in_spu) { return NULL; } else if ((scu_x << shift_4x4) <= x4x4 && (scu_y << shift_4x4) <= y4x4) { return p_cur; } else { cu_t *p_neighbor = &h->scu_data[(y4x4 >> 1) * h->i_width_in_scu + (x4x4 >> 1)]; return p_neighbor->i_slice_nr == p_cur->i_slice_nr ? p_neighbor : NULL; } } /* --------------------------------------------------------------------------- * (x_4x4, y_4x4) - 4x4-unit address of the neighboring transform block (in picture) * (scu_x, scu_y) - SCU address of the current CU (in picture) */ int get_neighbor_cbp_y(davs2_t *h, int x_4x4, int y_4x4, int scu_x, int scu_y, cu_t *p_cu) { cu_t *p_neighbor = get_neighbor_cu_in_slice(h, p_cu, scu_x, scu_y, x_4x4, y_4x4); if (p_neighbor == NULL) { return 0; } else if (p_neighbor->i_trans_size == TU_SPLIT_NON) { return p_neighbor->i_cbp & 1; // TU not split: return the luma CBP bit directly } else { int cbp = p_neighbor->i_cbp; int level = p_neighbor->i_cu_level - MIN_PU_SIZE_IN_BIT; int cu_mask = (1 << level) - 1; x_4x4 &= cu_mask; y_4x4 &= cu_mask; if (p_neighbor->i_trans_size == TU_SPLIT_VER) { // vertical split x_4x4 >>= (level - 2); return (cbp >> x_4x4) & 1; } else if (p_neighbor->i_trans_size == TU_SPLIT_HOR) { // horizontal split y_4x4 >>= (level - 2); return (cbp >> y_4x4) & 1; } else { // cross split x_4x4 >>= (level - 1); y_4x4 >>= (level - 1); return (cbp >> (x_4x4 + (y_4x4 << 1))) & 1; } } } davs2-1.6/source/common/block_info.h000066400000000000000000000031351337322544400174560ustar00rootroot00000000000000/* * block_info.h * * Description of this file: * Block Information functions definition of the davs2 library * * -------------------------------------------------------------------------- * * davs2 - video decoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn.
*/ #ifndef DAVS2_BLOCK_INFO_H #define DAVS2_BLOCK_INFO_H #ifdef __cplusplus extern "C" { #endif #define get_neighbor_cbp_y FPFX(get_neighbor_cbp_y) int get_neighbor_cbp_y(davs2_t *h, int xN, int yN, int scu_x, int scu_y, cu_t *p_cu); #ifdef __cplusplus } #endif #endif // DAVS2_BLOCK_INFO_H davs2-1.6/source/common/common.cc000066400000000000000000000276021337322544400170040ustar00rootroot00000000000000/* * common.cc * * Description of this file: * misc common functions definition of the davs2 library * * -------------------------------------------------------------------------- * * davs2 - video decoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ #include "common.h" #include <stdarg.h> /* restored: the header name was stripped from this dump; va_list/va_start below need it */ #if __ARM_ARCH_7__ #include <android/log.h> /* restored: needed for __android_log_print() and ANDROID_LOG_INFO below */ #define LOGI(format,...) __android_log_print(ANDROID_LOG_INFO, "davs2",format,##__VA_ARGS__) #endif /** * =========================================================================== * macros * =========================================================================== */ /** * =========================================================================== * global variables * =========================================================================== */ #if HIGH_BIT_DEPTH int max_pel_value = 255; int g_bit_depth = 8; int g_dc_value = 128; #endif /** * =========================================================================== * trace * =========================================================================== */ #if AVS2_TRACE /** * =========================================================================== * trace file * =========================================================================== */ FILE *h_trace = NULL; /* global file handle for trace file */ int g_bit_count = 0; /* global bit count for trace */ /* --------------------------------------------------------------------------- */ int avs2_trace_init(davs2_t *h, char *psz_trace_file) { if (strlen(psz_trace_file) > 0) { /* create or truncate the trace file */ h_trace = fopen(psz_trace_file, "wt"); if (!h_trace) { davs2_log(h, DAVS2_LOG_ERROR, "trace: can't write to trace file"); return -1; } else if (!davs2_is_regular_file(fileno(h_trace))) { davs2_log(h, DAVS2_LOG_ERROR, "trace: incompatible with non-regular file"); return -1; } } return 0; } /* --------------------------------------------------------------------------- */ void avs2_trace_destroy(void) { if (h_trace) { fclose(h_trace); } } /* --------------------------------------------------------------------------- */ int avs2_trace(const char *psz_fmt, ...)
{ int len = 0; /* append to the trace file */ if (h_trace) { va_list arg; va_start(arg, psz_fmt); len = vfprintf(h_trace, psz_fmt, arg); fflush(h_trace); va_end(arg); } return len; } void avs2_trace_string(char *trace_string, int value, int len) { int i, chars; avs2_trace("@"); chars = avs2_trace("%i", g_bit_count); while (chars++ < 6) { avs2_trace(" "); } chars += avs2_trace("%s", trace_string); while (chars++ < 55) { avs2_trace(" "); } // align bit-pattern if (len < 15) { for (i = 0; i < 15 - len; i++) { avs2_trace(" "); } } g_bit_count += len; while (len >= 32) { for (i = 0; i < 8; i++) { avs2_trace("0"); } len -= 8; } // print bit-pattern for (i = 0; i < len; i++) { if (0x01 & (value >> (len - i - 1))) { avs2_trace("1"); } else { avs2_trace("0"); } } avs2_trace(" (%3d)\n", value); } /* --------------------------------------------------------------------------- * write out a trace string to the trace file */ void avs2_trace_string2(char *trace_string, int bit_pattern, int value, int len) { int i, chars; avs2_trace("@"); chars = avs2_trace("%i", g_bit_count); while (chars++ < 6) { avs2_trace(" "); } chars += avs2_trace("%s", trace_string); while (chars++ < 55) { avs2_trace(" "); } // align bit-pattern if (len < 15) { for (i = 0; i < 15 - len; i++) { avs2_trace(" "); } } // print bit-pattern g_bit_count += len; for (i = 1; i <= len; i++) { if ((bit_pattern >> (len - i)) & 0x1) { avs2_trace("1"); } else { avs2_trace("0"); } } avs2_trace(" (%3d)\n", value); } #endif /* --------------------------------------------------------------------------- */ int xl_init(xlist_t *const xlist) { if (xlist == NULL) { return -1; } /* set list empty */ xlist->p_list_head = NULL; xlist->p_list_tail = NULL; /* set node number */ xlist->i_node_num = 0; /* create lock and conditions */ if (davs2_thread_mutex_init(&xlist->list_mutex, NULL) < 0 || davs2_thread_cond_init(&xlist->list_cond, NULL) < 0) { davs2_log(NULL, DAVS2_LOG_ERROR, "Failed to init lock for xl_init()"); return -1; } return 0; } /* --------------------------------------------------------------------------- */ void xl_destroy(xlist_t *const xlist) { if (xlist == NULL) { return; } /* destroy lock and conditions */ davs2_thread_mutex_destroy(&xlist->list_mutex); davs2_thread_cond_destroy(&xlist->list_cond); /* clear */ memset(xlist, 0, sizeof(xlist_t)); } /* --------------------------------------------------------------------------- */ void xl_append(xlist_t *const xlist, void *node) { node_t *new_node = (node_t *)node; if (xlist == NULL) { return; /* error */ } new_node->next = NULL; /* set NULL */ davs2_thread_mutex_lock(&xlist->list_mutex); /* lock */ /* append this node */ if (xlist->p_list_tail != NULL) { /* append this node at tail */ xlist->p_list_tail->next = new_node; } else { xlist->p_list_head = new_node; } xlist->p_list_tail = new_node; /* point to the tail node */ xlist->i_node_num++; /* increase the node number */ davs2_thread_mutex_unlock(&xlist->list_mutex); /* unlock */ /* all is done, notify one waiting thread to work */ davs2_thread_cond_signal(&xlist->list_cond); } /* --------------------------------------------------------------------------- */ void *xl_remove_head(xlist_t *const xlist, const int wait) { node_t *node = NULL; if (xlist == NULL) { return NULL; /* error */ } davs2_thread_mutex_lock(&xlist->list_mutex); if (wait && !xlist->i_node_num) { davs2_thread_cond_wait(&xlist->list_cond, &xlist->list_mutex); } /* remove the header node */ if (xlist->i_node_num > 0) { node = xlist->p_list_head; /* point to the header node */ 
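/* note (added for clarity): xl_append() adds nodes at the tail and this function removes them from the head, so an xlist_t behaves as a thread-safe FIFO queue */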
/* modify the list */ xlist->p_list_head = node->next; if (xlist->p_list_head == NULL) { /* no node left in this list, reset the tail pointer */ xlist->p_list_tail = NULL; } xlist->i_node_num--; /* decrease the number */ } davs2_thread_mutex_unlock(&xlist->list_mutex); return node; } /* --------------------------------------------------------------------------- */ void *xl_remove_head_ex(xlist_t *const xlist) { node_t *node = NULL; if (xlist == NULL) { return NULL; /* error */ } /* remove the header node */ if (xlist->i_node_num > 0) { node = xlist->p_list_head; /* point to the header node */ /* modify the list */ xlist->p_list_head = node->next; if (xlist->p_list_head == NULL) { /* no node left in this list, reset the tail pointer */ xlist->p_list_tail = NULL; } xlist->i_node_num--; /* decrease the number */ } return node; } /** * =========================================================================== * davs2_log * =========================================================================== */ #ifdef _MSC_VER /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE void davs2_set_font_color(int color) { static const WORD colors[] = { FOREGROUND_INTENSITY | FOREGROUND_GREEN, // green FOREGROUND_INTENSITY | FOREGROUND_GREEN | FOREGROUND_BLUE, // cyan FOREGROUND_INTENSITY | FOREGROUND_RED | FOREGROUND_GREEN, // yellow FOREGROUND_INTENSITY | FOREGROUND_RED, // red FOREGROUND_INTENSITY | FOREGROUND_RED | FOREGROUND_BLUE, // magenta }; SetConsoleTextAttribute(GetStdHandle(STD_OUTPUT_HANDLE), colors[color]); } #endif /* --------------------------------------------------------------------------- */ static void davs2_log_default(int i_log_level, const char *psz_fmt) { #if !defined(_MSC_VER) static const char str_color_clear[] = "\033[0m"; // "\033[0m" static const char str_color[][16] = { /* green cyan yellow red */ "\033[1;32m", "\033[1;36m", "\033[1;33m", "\033[1;31m" }; const char *cur_color = str_color[i_log_level]; #endif static const char *null_prefix = ""; const char *psz_prefix = null_prefix; switch (i_log_level) { case DAVS2_LOG_ERROR: psz_prefix = "[davs2 error]: "; break; case DAVS2_LOG_WARNING: psz_prefix = "[davs2 warn]: "; break; case DAVS2_LOG_INFO: psz_prefix = "[davs2 info]: "; break; case DAVS2_LOG_DEBUG: psz_prefix = "[davs2 debug]: "; break; default: psz_prefix = "[davs2 *]: "; #if !defined(_MSC_VER) cur_color = str_color[0]; #endif break; } #if defined(_MSC_VER) davs2_set_font_color(i_log_level); /* set color */ fprintf(stderr, "%s%s\n", psz_prefix, psz_fmt); davs2_set_font_color(0); /* restore to white color */ #else fprintf(stderr, "%s%s%s%s\n", cur_color, psz_prefix, psz_fmt, str_color_clear); #endif } /* --------------------------------------------------------------------------- */ void davs2_log(void *handle, int level, const char *format, ...)
{ davs2_log_t *h = (davs2_log_t *)handle; int i_enable_level = 0; if (h != NULL) { i_enable_level = h->i_log_level; } DAVS2_ASSERT(level >= 0 && level < DAVS2_LOG_MAX, "Invalid log level %d", level); if (level >= i_enable_level) { char message[2048] = { 0 }; if (h != NULL) { sprintf(message, "%s: ", h->module_name); } va_list arg_ptr; va_start(arg_ptr, format); vsprintf(message + strlen(message), format, arg_ptr); va_end(arg_ptr); davs2_log_default(level, message); } } davs2-1.6/source/common/common.h000066400000000000000000001604541337322544400166510ustar00rootroot00000000000000/* * common.h * * Description of this file: * misc common functions definition of the davs2 library * * -------------------------------------------------------------------------- * * davs2 - video decoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ #ifndef DAVS2_COMMON_H #define DAVS2_COMMON_H #ifdef __cplusplus extern "C" { #endif /** * =========================================================================== * common include files * =========================================================================== */ #include "defines.h" #include "osdep.h" #include "davs2.h" #include <stdio.h> /* restored: header names were stripped from this dump; these three are a best guess for the standard headers this file relies on */ #include <stdlib.h> #include <string.h> #if (ARCH_X86 || ARCH_X86_64) #include <immintrin.h> /* restored (assumed): x86 SIMD intrinsics header */ #endif /** * =========================================================================== * basic type defines * =========================================================================== */ #if HIGH_BIT_DEPTH typedef uint16_t pel_t; /* type for pixel value */ typedef uint64_t pel4_t; /* type for 4-pixels value */ typedef int32_t itr_t; /* intra prediction temp */ #else typedef uint8_t pel_t; /* type for pixel value */ typedef uint32_t pel4_t; /* type for 4-pixels value */ typedef int16_t itr_t; /* intra prediction temp */ #endif typedef int16_t coeff_t; /* type for transform coefficient */ typedef int16_t mct_t; /* motion compensation temp */ typedef uint8_t bool_t; /* type for flag */ typedef struct cu_t cu_t; typedef struct davs2_log_t davs2_log_t; typedef struct davs2_t davs2_t; typedef struct davs2_mgr_t davs2_mgr_t; typedef struct davs2_outpic_t davs2_outpic_t; /** * =========================================================================== * macros * =========================================================================== */ #define IS_HOR_PU_PART(mode) (((1 << (mode)) & MASK_HOR_PU_MODES) != 0) #define IS_VER_PU_PART(mode) (((1 << (mode)) & MASK_VER_PU_MODES) != 0) #define IS_INTRA_MODE(mode) (((1 << (mode)) & MASK_INTRA_MODES ) != 0) #define IS_INTER_MODE(mode) (((1 << (mode)) & MASK_INTER_MODES ) != 0) #define IS_NOSKIP_INTER_MODE(mode) (((1 << (mode)) &
MASK_INTER_NOSKIP) != 0) #define IS_SKIP_MODE(mode) ((mode) == PRED_SKIP) #define IS_INTRA(cu) IS_INTRA_MODE((cu)->i_cu_type) #define IS_INTER(cu) IS_INTER_MODE((cu)->i_cu_type) #define IS_NOSKIP_INTER(cu) IS_NOSKIP_INTER_MODE((cu)->i_cu_type) #define IS_SKIP(cu) IS_SKIP_MODE((cu)->i_cu_type) static ALWAYS_INLINE int DAVS2_MAX(int a, int b) { return ((a) > (b) ? (a) : (b)); } static ALWAYS_INLINE int DAVS2_MIN(int a, int b) { return ((a) < (b) ? (a) : (b)); } #define DAVS2_ABS(a) ((a) < 0 ? (-(a)) : (a)) #define DAVS2_CLIP1(a) (pel_t)((a) > max_pel_value ? max_pel_value : ((a) < 0 ? 0 : (a))) static ALWAYS_INLINE int DAVS2_CLIP3(int L, int H, int v) { return (((v) < (L)) ? (L) : (((v) > (H)) ? (H) : (v))); } #define DAVS2_SWAP(x,y) { (y)=(y)^(x); (x)=(y)^(x); (y)=(x)^(y); } #define DAVS2_ALIGN(x, a) (((x) + ((a) - 1)) & (~((a) - 1))) #define LCU_STRIDE (MAX_CU_SIZE) #define LCU_BUF_SIZE (LCU_STRIDE * MAX_CU_SIZE) /* size of LCU buffer size */ /* --------------------------------------------------------------------------- * multi line macros */ #if defined(_MSC_VER) || defined(__INTEL_COMPILER) #define MULTI_LINE_MACRO_BEGIN do { #define MULTI_LINE_MACRO_END \ __pragma(warning(push))\ __pragma(warning(disable:4127))\ } while (0)\ __pragma(warning(pop)) #else #define MULTI_LINE_MACRO_BEGIN { #define MULTI_LINE_MACRO_END } #endif /* --------------------------------------------------------------------------- * memory malloc */ #define CHECKED_MALLOC(var, type, size) \ MULTI_LINE_MACRO_BEGIN\ (var) = (type)davs2_malloc(size);\ if ((var) == NULL) {\ goto fail;\ }\ MULTI_LINE_MACRO_END #define CHECKED_MALLOCZERO(var, type, size) \ MULTI_LINE_MACRO_BEGIN\ CHECKED_MALLOC(var, type, size);\ memset(var, 0, size);\ MULTI_LINE_MACRO_END /** * =========================================================================== * enum defines * =========================================================================== */ /* --------------------------------------------------------------------------- * task status */ enum task_status_t { TASK_FREE = 0, /* task is free, could be used */ TASK_BUSY = 1 /* task busy */ }; /* --------------------------------------------------------------------------- * coding types */ enum coding_type_e { FRAME_CODING = 0, /* frame coding */ FIELD_CODING = 1 /* field coding */ }; /* --------------------------------------------------------------------------- * picture struct */ enum pic_struct_e { FIELD = 0, /* field picture struct */ FRAME = 1 /* frame picture struct */ }; /* --------------------------------------------------------------------------- * slice type */ enum { AVS2_I_SLICE = 0, /* slice type: I frame */ AVS2_P_SLICE = 1, /* slice type: P frame */ AVS2_B_SLICE = 2, /* slice type: B frame */ AVS2_G_SLICE = 3, /* AVSS2 type: G frame, should be output (as I frame) */ AVS2_F_SLICE = 4, /* slice type: F frame */ AVS2_S_SLICE = 5, /* AVSS2 type: S frame */ AVS2_GB_SLICE = 6, /* AVSS2 type: GB frame, should not be output */ }; /* --------------------------------------------------------------------------- * start codes */ enum start_code_e { SC_SEQUENCE_HEADER = 0xB0, /* sequence header start code */ SC_SEQUENCE_END = 0xB1, /* sequence end start code */ SC_USER_DATA = 0xB2, /* user data start code */ SC_INTRA_PICTURE = 0xB3, /* intra picture start code */ SC_EXTENSION = 0xB5, /* extension start code */ SC_INTER_PICTURE = 0xB6, /* inter picture start code */ SC_VIDEO_EDIT_CODE = 0xB7, /* video edit start code */ SC_SLICE_CODE_MIN = 0x00, /* min slice start code */ SC_SLICE_CODE_MAX = 
0x8F /* max slice start code */ }; /* --------------------------------------------------------------------------- * all prediction modes (n = N/2) */ enum cu_pred_mode_e { /* all inter modes: 8 */ PRED_SKIP = 0, /* skip/direct block: 1 */ PRED_2Nx2N = 1, /* 2N x 2N block: 1 */ PRED_2NxN = 2, /* 2N x N block: 2 */ PRED_Nx2N = 3, /* N x 2N block: 2 */ PRED_2NxnU = 4, /* 2N x n + 2N x 3n block: 2 */ PRED_2NxnD = 5, /* 2N x 3n + 2N x n block: 2 */ PRED_nLx2N = 6, /* n x 2N + 3n x 2N block: 2 */ PRED_nRx2N = 7, /* 3n x 2N + n x 2N block: 2 */ /* all intra modes: 4 */ PRED_I_2Nx2N = 8, /* 2N x 2N block: 1 */ PRED_I_NxN = 9, /* N x N block: 4 */ PRED_I_2Nxn = 10, /* 2N x n (32x8, 16x4) block: 4 */ PRED_I_nx2N = 11, /* n x 2N (8x32, 4x16) block: 4 */ /* mode numbers */ MAX_PRED_MODES = 12, /* total 12 pred modes, include: */ MAX_INTER_MODES = 8, /* 8 inter modes */ MAX_INTRA_MODES = 4, /* 4 intra modes */ /* masks */ MASK_HOR_TU_MODES = 0x0430, /* mask for horizontal TU partition */ MASK_VER_TU_MODES = 0x08C0, /* mask for vertical TU partition */ MASK_HOR_PU_MODES = 0x0434, /* mask for horizontal PU partition */ MASK_VER_PU_MODES = 0x08C8, /* mask for vertical PU partition */ MASK_INTER_MODES = 0x00FF, /* mask for inter modes */ MASK_INTER_NOSKIP = 0x00FE, /* mask for inter modes except skip */ MASK_INTRA_MODES = 0x0F00 /* mask for intra modes */ }; /* --------------------------------------------------------------------------- * splitting type of transform unit */ enum tu_split_type_e { TU_SPLIT_INVALID = -1, /* invalid split type */ TU_SPLIT_NON = 0, /* not split */ TU_SPLIT_HOR = 1, /* horizontally split into 4 blocks */ TU_SPLIT_VER = 2, /* vertically split into 4 blocks */ TU_SPLIT_CROSS = 3, /* cross split into 4 blocks */ NUM_TU_SPLIT_TYPE = 4 /* number of transform split types */ }; /* --------------------------------------------------------------------------- * pu partition */ enum PU_PART { /* square */ PART_4x4, PART_8x8, PART_16x16, PART_32x32, PART_64x64, /* rectangular */ PART_8x4, PART_4x8, PART_16x8, PART_8x16, PART_32x16, PART_16x32, PART_64x32, PART_32x64, /* asymmetrical (0.75, 0.25) */ PART_16x12, PART_12x16, PART_16x4, PART_4x16, PART_32x24, PART_24x32, PART_32x8, PART_8x32, PART_64x48, PART_48x64, PART_64x16, PART_16x64, /* max number of partitions */ MAX_PART_NUM }; /* --------------------------------------------------------------------------- * DCT pattern */ enum dct_pattern_e { DCT_DEAULT, /* default */ DCT_HALF, /* square block: non-zero coeffs only in top-left 1/2 of width and height (1/4 area); non-square block: top-left 1/2 x 1/2 */ DCT_QUAD, /* square block: non-zero coeffs only in top-left 1/4 of width and height (1/16 area); non-square block: top-left 1/4 x 1/4 */ /* max number of DCT pattern */ DCT_PATTERN_NUM }; /* --------------------------------------------------------------------------- * context mode */ enum context_mode_e { INTRA_PRED_VER = 0, /* intra vertical predication */ INTRA_PRED_HOR = 1, /* intra horizontal predication */ INTRA_PRED_DC_DIAG = 2 /* intra DC predication */ }; /* --------------------------------------------------------------------------- * image component index */ enum img_component_index_e { IMG_Y = 0, /* image component: Y */ IMG_U = 1, /* image component: Cb */ IMG_V = 2, /* image component: Cr */ IMG_COMPONENTS = 3 /* number of image components */ }; /* --------------------------------------------------------------------------- * prediction direction for inter frame */ enum inter_pred_direction_e { INVALID_REF = -1, /* invalid */ B_BWD = 0, /* backward */ B_FWD = 1 /* forward */ }; /* --------------------------------------------------------------------------- * neighboring position used in inter coding (MVP) or intra prediction
*/ enum neighbor_block_pos_e { BLK_TOPLEFT = 0, /* D: top-left block: (x - 1, y - 1) */ BLK_TOP = 1, /* B: top block: (x , y - 1) */ BLK_LEFT = 2, /* A: left block: (x - 1, y ) */ BLK_TOPRIGHT = 3, /* C: top-right block: (x + W , y - 1) */ BLK_TOP2 = 4, /* G: top block: (x + W - 1, y - 1) */ BLK_LEFT2 = 5, /* F: left block: (x - 1, y + H - 1) */ BLK_COLLOCATED = 6, /* Col: mode of temporal neighbor */ NUM_INTER_NEIGHBOR = BLK_COLLOCATED + 1 }; /* --------------------------------------------------------------------------- * spatial direct/skip mode types for B and F frames */ enum direct_skip_mode_e { DS_NONE = 0, /* no spatial direct/skip mode */ /* spatial direct/skip mode for B frame */ DS_B_BID = 1, /* skip/direct mode: bi-direction */ DS_B_BWD = 2, /* : backward direction */ DS_B_SYM = 3, /* : symmetrical direction */ DS_B_FWD = 4, /* : forward direction */ /* spatial direct/skip mode for F frame */ DS_DUAL_1ST = 1, /* skip/direct mode: dual 1st */ DS_DUAL_2ND = 2, /* : dual 2nd */ DS_SINGLE_1ST = 3, /* : single 1st */ DS_SINGLE_2ND = 4, /* : single 2st */ /* max number */ DS_MAX_NUM = 5 /* max spatial direct/skip mode number of B or F frames */ }; /* --------------------------------------------------------------------------- */ enum intra_avail_e { MD_I_LEFT = 0, MD_I_TOP = 1, MD_I_LEFT_DOWN = 2, MD_I_TOP_RIGHT = 3, MD_I_TOP_LEFT = 4, MD_I_NUM = 5, #define IS_NEIGHBOR_AVAIL(i_avai, md) ((i_avai) & (1 << (md))) }; /* --------------------------------------------------------------------------- * sao modes */ enum sao_mode_e { SAO_MODE_OFF = 0, /* sao mode: off */ SAO_MODE_MERGE = 1, /* sao mode: merge */ SAO_MODE_NEW = 2 /* sao mode: new */ }; /* --------------------------------------------------------------------------- * sao mode merge types */ enum sao_mode_merge_type_e { SAO_MERGE_LEFT = 0, /* sao merge type: left */ SAO_MERGE_ABOVE = 1, /* sao merge type: above */ NUM_SAO_MERGE_TYPES = 2 /* number of sao merge types */ }; /* --------------------------------------------------------------------------- * sao mode types */ enum sao_mode_type_e { SAO_TYPE_EO_0 = 0, /* sao mode type: EO - 0 */ SAO_TYPE_EO_90 = 1, /* sao mode type: EO - 90 */ SAO_TYPE_EO_135 = 2, /* sao mode type: EO - 135 */ SAO_TYPE_EO_45 = 3, /* sao mode type: EO - 45 */ SAO_TYPE_BO = 4 /* sao mode type: BO */ }; /* --------------------------------------------------------------------------- * sao EO classes * the assignments depended on how you implement the edgeType calculation */ enum sao_EO_classes_e { SAO_CLASS_EO_FULL_VALLEY = 0, SAO_CLASS_EO_HALF_VALLEY = 1, SAO_CLASS_EO_PLAIN = 2, SAO_CLASS_EO_HALF_PEAK = 3, SAO_CLASS_EO_FULL_PEAK = 4, SAO_CLASS_BO = 5, NUM_SAO_OFFSET = 6 }; /* --------------------------------------------------------------------------- * contexts for syntax elements */ #define NUM_CUTYPE_CTX 6 #define NUM_SPLIT_CTX 3 // CU depth #define NUM_INTRA_PU_TYPE_CTX 1 /* intra prediction */ #define NUM_MVD_CTX 3 #define NUM_REF_NO_CTX 3 #define NUM_DELTA_QP_CTX 4 #define NUM_INTER_DIR_CTX 15 #define NUM_INTER_DIR_DHP_CTX 3 #define NUM_DMH_MODE_CTX 12 #define NUM_AMP_CTX 2 #define NUM_C_INTRA_MODE_CTX 3 #define NUM_CTP_CTX 9 #define NUM_INTRA_MODE_CTX 7 #define NUM_TU_SPLIT_CTX 3 #define WPM_NUM 3 #define NUM_DIR_SKIP_CTX 4 /* B Skip mode, F Skip mode */ /* transform coefficients */ #define NUM_BLOCK_TYPES 3 #define NUM_MAP_CTX 11 #define NUM_LAST_CG_CTX_LUMA 6 #define NUM_LAST_CG_CTX_CHROMA 6 #define NUM_SIGCG_CTX_LUMA 2 #define NUM_SIGCG_CTX_CHROMA 1 #define NUM_LAST_POS_CTX_LUMA 48 #define NUM_LAST_POS_CTX_CHROMA 12
#define NUM_COEFF_LEVEL_CTX 40 #define NUM_LAST_CG_CTX (NUM_LAST_CG_CTX_LUMA+NUM_LAST_CG_CTX_CHROMA) #define NUM_SIGCG_CTX (NUM_SIGCG_CTX_LUMA+NUM_SIGCG_CTX_CHROMA) #define NUM_LAST_POS_CTX (NUM_LAST_POS_CTX_LUMA+NUM_LAST_POS_CTX_CHROMA) /* in-loop filters */ #define NUM_SAO_MERGE_FLAG_CTX 3 #define NUM_SAO_MODE_CTX 1 #define NUM_SAO_OFFSET_CTX 2 #define NUM_INTER_DIR_MIN_CTX 2 #define NUM_ALF_LCU_CTX 4 /* adaptive loop filter */ /** * =========================================================================== * struct type defines * =========================================================================== */ /* --------------------------------------------------------------------------- * node */ typedef struct node_t node_t; struct node_t { node_t *next; /* pointer to next node */ }; /* --------------------------------------------------------------------------- * xlist_t */ typedef struct xlist_t { node_t *p_list_head; /* pointer to head of node list */ node_t *p_list_tail; /* pointer to tail of node list */ davs2_thread_cond_t list_cond; /* list condition variable */ davs2_thread_mutex_t list_mutex; /* list mutex lock */ int i_node_num; /* node number in the list */ } xlist_t; #if defined(_MSC_VER) || defined(__ICL) #pragma warning(disable: 4201) // non-standard extension used (nameless struct/union) #endif /* --------------------------------------------------------------------------- * syntax context type */ typedef union context_t { struct { unsigned cycno : 2; // 2 bits unsigned MPS : 1; // 1 bit unsigned LG_PMPS : 11; // 11 bits }; uint16_t v; } context_t; /* --------------------------------------------------------------------------- * syntax context management */ typedef struct context_set_t { /* CU */ context_t cu_type_contexts [NUM_CUTYPE_CTX]; context_t intra_pu_type_contexts [NUM_INTRA_PU_TYPE_CTX]; context_t cu_split_flag [NUM_SPLIT_CTX]; context_t transform_split_flag [NUM_TU_SPLIT_CTX]; context_t shape_of_partition_index [NUM_AMP_CTX]; context_t pu_reference_index [NUM_REF_NO_CTX]; context_t cbp_contexts [NUM_CTP_CTX]; context_t mvd_contexts [2][NUM_MVD_CTX]; /* inter prediction */ context_t pu_type_index [NUM_INTER_DIR_CTX]; // b_pu_type_index[15] = f_pu_type_index[3] + dir_multi_hypothesis_mode[12] context_t b_pu_type_min_index [NUM_INTER_DIR_MIN_CTX]; context_t cu_subtype_index [NUM_DIR_SKIP_CTX]; // B_Skip/B_Direct, F_Skip/F_Direct context_t weighted_skip_mode [WPM_NUM]; context_t delta_qp_contexts [NUM_DELTA_QP_CTX]; /* intra prediction */ context_t intra_luma_pred_mode [NUM_INTRA_MODE_CTX]; context_t intra_chroma_pred_mode [NUM_C_INTRA_MODE_CTX]; /* transform coefficients */ context_t coeff_run [2][NUM_BLOCK_TYPES][NUM_MAP_CTX]; context_t coeff_level [NUM_COEFF_LEVEL_CTX]; context_t last_cg_contexts [NUM_LAST_CG_CTX]; context_t sig_cg_contexts [NUM_SIGCG_CTX]; context_t last_coeff_pos [NUM_LAST_POS_CTX]; /* in-loop filters */ context_t sao_mergeflag_context [NUM_SAO_MERGE_FLAG_CTX]; context_t sao_mode_context [NUM_SAO_MODE_CTX]; context_t sao_offset_context [NUM_SAO_OFFSET_CTX]; context_t alf_lcu_enable_scmodel [NUM_ALF_LCU_CTX * 3]; } context_set_t; /* --------------------------------------------------------------------------- * bitstream */ typedef struct davs2_bs_t { uint8_t *p_stream; /* pointer to the code-buffer */ int i_stream; /* over code-buffer length, byte-oriented */ int i_bit_pos; /* actual position in the code-buffer, bit-oriented */ #if !ARCH_X86_64 int reserved; /* reserved */ #endif } davs2_bs_t; /* --------------------------------------------------------------------------- * SAO parameters for component block */ typedef struct
sao_param_t { int modeIdc; // NEW, MERGE, OFF int typeIdc; // NEW: EO_0, EO_90, EO_135, EO_45, BO. MERGE: left, above int startBand; // BO: starting band index int startBand2; int offset[MAX_NUM_SAO_CLASSES]; } sao_param_t; /* --------------------------------------------------------------------------- * SAO parameters for LCU */ typedef struct sao_t { sao_param_t planes[IMG_COMPONENTS]; } sao_t; /* --------------------------------------------------------------------------- * ALF parameters */ typedef struct alf_param_t { int num_coeff; int filters_per_group; int componentID; int filterPattern[ALF_NUM_VARS]; int coeffmulti[ALF_NUM_VARS][ALF_MAX_NUM_COEF]; // luma: up to 16 coefficient sets; chroma: only 1 } alf_param_t; typedef struct alf_var_t { alf_param_t img_param[IMG_COMPONENTS]; int filterCoeffSym[ALF_NUM_VARS][ALF_MAX_NUM_COEF]; int tab_region_coeff_idx[ALF_NUM_VARS]; /* coefficient look-up table for 16 regions */ uint8_t *tab_lcu_region; /* region index look-up table for LCUs */ } alf_var_t; /* --------------------------------------------------------------------------- * reference index */ typedef union ref_idx_t { struct { // nameless struct int8_t r[2]; // ref 1st and 2nd, 4 bit (signed integer) }; uint16_t v; // v = ((r2 << 8) | (r1 & 0xFF)), 16-bit } ref_idx_t; /* --------------------------------------------------------------------------- * motion vector */ typedef union mv_t { struct { // nameless struct int16_t x; // x, low 16-bit int16_t y; // y, high 16-bit }; uint32_t v; // v = ((y << 16) | (x & 0xFFFF)), 32-bit } mv_t; /* --------------------------------------------------------------------------- * coding block */ typedef union cb_t { struct { /* nameless struct */ int8_t x; /* start position (x, in pixel) within current CU */ int8_t y; /* start position (y, in pixel) within current CU */ int8_t w; /* block width (in pixel) */ int8_t h; /* block height (in pixel) */ }; uint32_t v; /* used for fast operation for all components */ } cb_t; /* --------------------------------------------------------------------------- * motion vector */ typedef struct neighbor_inter_t { mv_t mv[2]; /* motion vectors */ int8_t is_available; /* is block available */ int8_t i_dir_pred; /* predict direction */ ref_idx_t ref_idx; /* reference indexes of 1st and 2nd frame */ } neighbor_inter_t; /* --------------------------------------------------------------------------- */ typedef struct aec_t { ALIGN32(uint8_t *p_buffer); uint64_t i_byte_buf; int i_byte_pos; int i_bytes; int8_t i_bits_to_go; bool_t b_bit_error; /* bit error in stream */ bool_t b_val_bound; bool_t b_val_domain; // is value in R domain (1: R domain, 0: LG domain) uint32_t i_s1; uint32_t i_t1; uint32_t i_value_s; uint32_t i_value_t; /* context */ context_set_t syn_ctx; // pointer to struct of context models #if AVS2_TRACE /* --------------------------------------------------------------------------- * syntax element */ #define TRACESTRING_SIZE 128 // size of trace string char tracestring[TRACESTRING_SIZE]; // trace string #endif // AVS2_TRACE } aec_t; /* --------------------------------------------------------------------------- * reference picture set (RPS) */ typedef struct rps_t { int ref_pic[AVS2_MAX_REFS]; /* delta COI of ref pic */ int remove_pic[8]; /* delta COI of removed pic */ int num_of_ref; /* number of reference picture */ int num_to_remove; /* number of removed picture */ int refered_by_others; /* referenced by others */ int reserved; /* reserved 4 bytes */ } rps_t; /* --------------------------------------------------------------------------- * sequence set
information */ typedef struct davs2_seq_t { int valid_flag; /* is this sequence header valid ? */ davs2_seq_info_t head; /* sequence header information (output) */ int sample_precision; /* sample precision */ int encoding_precision; /* encoding precision */ int bit_rate_lower; /* bitrate (lower) */ int bit_rate_upper; /* bitrate (upper) */ int i_enc_width; /* sequence encoding width */ int i_enc_height; /* sequence encoding height */ int log2_lcu_size; /* largest coding block size */ bool_t b_field_coding; /* field coded sequence? */ bool_t b_temporal_id_exist; /* temporal id exist flag */ bool_t enable_weighted_quant; /* weight quant enable */ bool_t enable_background_picture;/* background picture enabled? */ bool_t enable_mhp_skip; /* mhpskip enabled? */ bool_t enable_dhp; /* dhp enabled? */ bool_t enable_wsm; /* wsm enabled? */ bool_t enable_amp; /* AMP(asymmetric motion partitions) enabled? */ bool_t enable_nsqt; /* use NSQT? */ bool_t enable_sdip; /* use SDIP? */ bool_t enable_2nd_transform; /* secondary transform enabled? */ bool_t enable_sao; /* SAO enabled? */ bool_t enable_alf; /* ALF enabled? */ bool_t enable_pmvr; /* PMVR enabled? */ bool_t cross_loop_filter_flag; /* cross loop filter flag */ int picture_reorder_delay; /* picture reorder delay */ int num_of_rps; /* rps set number */ rps_t seq_rps[AVS2_GOP_NUM]; /* RPS at sequence level */ int16_t seq_wq_matrix[2][64]; /* sequence base weighting quantization matrix */ } davs2_seq_t; /* --------------------------------------------------------------------------- * davs2_frame_t */ typedef struct davs2_frame_t { /* properties */ int64_t i_pts; /* user pts (presentation time stamp) */ int64_t i_dts; /* user dts (decoding time stamp) */ int i_type; /* frame type */ int i_qp; int i_chroma_format; /* chroma format (for function davs2_write_a_frame) */ int i_output_bit_depth; /* output bit depth (for function davs2_write_a_frame) */ int i_sample_bit_depth; /* sample bit depth (for function davs2_write_a_frame) */ int frm_decode_error; /* is there any decoding error in this frame? */ int dist_refs[AVS2_MAX_REFS]; /* distance of reference frames, used for MV scaling */ int dist_scale_refs[AVS2_MAX_REFS]; /* = (MULTI / dist_refs) */ int i_poc; /* POC (picture order count), used for MV scaling */ int i_coi; /* COI (coding order index) */ int b_refered_by_others; /* referenced by others */ /* planes */ int i_plane; /* number of planes */ int i_width[3]; /* width for Y/U/V */ int i_lines[3]; /* height for Y/U/V */ int i_stride[3]; /* stride for Y/U/V */ /* parallel */ uint32_t i_ref_count; /* the reference count, DO NOT move its position in this struct */ int i_disposable; /* what to do with the frame when the reference count is decreased to 0? */ /* 0: do nothing, 1: clean the frame, 2: free the frame */ /* frames with 'i_disposable' greater than 0 should NOT be referenced. 
*/ int is_self_malloc; /* is the buffer allocated by itself */ volatile int i_decoded_line; /* latest lcu line that finished reconstruction */ volatile int i_parsed_lcu_xy; /* parsed number of LCU */ int i_conds; /* number of condition variables */ davs2_thread_cond_t cond_aec; /* signal of AEC decoding */ davs2_thread_cond_t *conds_lcu_row; /* [LCU lines] */ int *num_decoded_lcu_in_row; /* number of LCUs decoded in a row */ davs2_thread_mutex_t mutex_frm; /* mutex protecting this frame */ davs2_thread_mutex_t mutex_recon; /* mutex of reconstruction threads */ /* buffers */ pel_t *planes[3]; /* pointers to Y/U/V data buffer */ int8_t *refbuf; /* pointers to reference index buffer */ mv_t *mvbuf; /* pointers to motion vector buffer */ } davs2_frame_t; /* --------------------------------------------------------------------------- * weighting quantization */ typedef struct weighted_quant_t { int pic_wq_data_index; int wq_param; int wq_model; int16_t quant_param_undetail[6]; int16_t quant_param_detail[6]; int16_t cur_wq_matrix[4][64]; // [matrix_id][coef] int16_t wq_matrix[2][2][64]; // [matrix_id][detail/undetail][coef] int16_t seq_wq_matrix[2][64]; int16_t pic_user_wq_matrix[2][64]; int16_t wquant_param[2][6]; } weighted_quant_t; /* --------------------------------------------------------------------------- * Run-Level pair */ typedef struct runlevel_pair_t { int16_t run; int16_t level; } runlevel_pair_t; /* --------------------------------------------------------------------------- * Run-Level info */ typedef struct runlevel_t { ALIGN32(runlevel_pair_t run_level[16]); /* run-level pairs of one 4x4 coefficient group (CG); the largest transform block is 32x32 */ int num_nonzero_cg; // number of CGs with non-zero coefficients uint32_t reserved; /* contexts pointer */ context_t(*p_ctx_run)[NUM_MAP_CTX]; context_t *p_ctx_level; context_t *p_ctx_sig_cg; context_t *p_ctx_last_cg; context_t *p_ctx_last_pos_in_cg; const int16_t(*avs_scan)[2]; const int16_t(*cg_scan)[2]; coeff_t *p_res; int i_res; int b_swap_xy; int num_cg; int i_tu_level; int w_tr; int h_tr; } runlevel_t; /* --------------------------------------------------------------------------- * LCU reconstruction info */ typedef struct lcu_rec_info_t { ALIGN32(coeff_t coeff_buf_y[LCU_BUF_SIZE]); ALIGN32(coeff_t coeff_buf_uv[2][LCU_BUF_SIZE >> 2]); } lcu_rec_info_t; /* --------------------------------------------------------------------------- * LCU info */ typedef struct lcu_info_t { #if CTRL_AEC_THREAD lcu_rec_info_t rec_info; #endif sao_t sao_param; /* SAO param for each LCU */ uint8_t enable_alf[IMG_COMPONENTS]; /* ALF enabled for each LCU */ } lcu_info_t; /* --------------------------------------------------------------------------- * coding unit */ struct cu_t { /* ------------------------------------------------------------- * variables needed for neighboring CU decoding */ int8_t i_cu_level; int8_t i_cu_type; int8_t i_slice_nr; int8_t i_qp; int8_t i_cbp; int8_t i_trans_size; /* tu_split_type_e */ /* ------------------------------------------------------------- */ int8_t i_weighted_skipmode; int8_t i_md_directskip_mode; int8_t c_ipred_mode; /* chroma intra prediction mode */ int8_t i_dmh_mode; /* dir_multi_hypothesis_mode */ int8_t num_pu; /* number of prediction units */ /* ------------------------------------------------------------- * buffers */ int8_t b8pdir[4]; int8_t intra_pred_modes[4]; int8_t dct_pattern[6]; /* DCT pattern of each block, dct_pattern_e, 4 luma + 2 chroma blocks */ mv_t mv[4][2]; /* [block_idx][1st/2nd] */ ref_idx_t ref_idx[4]; /* [block_idx].r[1st/2nd] */ cb_t pu[4]; /* used to reserve the size of PUs */ };
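/* ---------------------------------------------------------------------------
 * Illustration (editor's sketch, not part of the decoder): because the four
 * int8_t fields of cb_t share storage with the 32-bit word 'v', a geometry
 * entry written for an 8x8 CU can be rescaled for a larger CU by shifting
 * the packed word, which multiplies x, y, w and h at once as long as no
 * byte overflows into its neighbor (table entries are <= 8, shifts <= 3).
 * cu_init_prediction_units() below relies on exactly this:
 *
 *     cb_t blk;
 *     blk.x = 0; blk.y = 4; blk.w = 8; blk.h = 4;   // 8x4 PU of an 8x8 CU
 *     blk.v <<= 2;                                  // same PU in a 32x32 CU
 *     // now blk = { .x = 0, .y = 16, .w = 32, .h = 16 }
 */
#include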
"primitives.h" /* get partition index for the given size */ extern const uint8_t g_partition_map_tab[]; #define PART_INDEX(w, h) (g_partition_map_tab[((((w) >> 2) - 1) << 4) + ((h) >> 2) - 1]) /* --------------------------------------------------------------------------- * output picture */ struct davs2_outpic_t { ALIGN16(void *magic); /* must be the 1st member variable. do not change it */ davs2_frame_t *frame; /* the source frame */ davs2_seq_info_t *head; /* sequence head used to decode the frame */ davs2_picture_t *pic; /* the output picture */ davs2_outpic_t *next; /* next node */ }; /* --------------------------------------------------------------------------- * output picture list */ typedef struct davs2_output_t { int output; /* output index of the next frame */ int busy; /* whether possibly one frame is being delivered */ int num_output_pic; /* number of pictures to be output */ davs2_outpic_t *pics; /* output pictures */ } davs2_output_t; /* --------------------------------------------------------------------------- * assemble elementary stream to a complete decodable unit (e.g., one frame), * the complete decodable unit is called ES unit */ typedef struct es_unit_t { ALIGN16(void *magic); /* must be the 1st member variable. do not change it */ davs2_bs_t bs; /* bit-stream reader of this es_unit */ int64_t pts; /* presentation time stamp */ int64_t dts; /* decoding time stamp */ int len; /* length of valid data in byte stream buffer */ int size; /* buffer size */ uint8_t data[1]; /* byte stream buffer */ } es_unit_t; /* --------------------------------------------------------------------------- * decoder task */ typedef struct davs2_task_t { ALIGN32(int task_id); /* task id */ int task_status; /* 0: free; 1, busy */ davs2_mgr_t *taskmgr; /* the taskmgr */ es_unit_t *curr_es_unit; /* decoding ES unit */ davs2_thread_t thread_decode; /* handle of the decoding thread */ } davs2_task_t; /* --------------------------------------------------------------------------- */ struct davs2_log_t { int i_log_level; /* log level */ char module_name[60]; /* module name */ }; /* --------------------------------------------------------------------------- * decoder manager */ struct davs2_mgr_t { davs2_log_t module_log; /* log module */ volatile int b_exit; /* app signal to exit */ volatile int b_flushing; /* is being flushing */ davs2_param_t param; /* decoder param */ es_unit_t *es_unit; /* next input ES unit pointer */ davs2_seq_t seq_info; /* latest sequence head */ int i_tr_wrap_cnt;/* COI wrap count */ int i_prev_coi; /* previous COI */ /* --- decoder output --------- */ int new_sps; /* is SPS(sequence property set) changed? 
*/ int num_frames_to_output; /* --- decoding picture buffer (DPB) --------- */ davs2_frame_t **dpb; /* decoded picture buffer array */ int dpbsize; /* size of the dpb array */ /* --- frames to be removed before next frame decoding --------- */ int num_frames_to_remove; /* number of frames to be removed */ int coi_remove_frame[8]; /* COI of frames to be removed */ /* --- lists (input & output) ---------------------------------- */ xlist_t packets_idle; /* bit-stream: free buffers for input packets */ xlist_t pic_recycle; /* output_picture: free pictures recycle bin */ davs2_output_t outpics; /* output pictures */ /* --- task ---------------------------------------------------- */ int num_decoders; /* number of decoders in total */ int num_active_decoders; /* number of active decoders currently */ davs2_t *decoders; /* frame decoder contexts */ davs2_t *h_dec; /* decoder context for current input bitstream */ int num_frames_in; /* number of frames: input */ int num_frames_out; /* number of frames: output */ /* --- thread control ------------------------------------------ */ int num_total_thread; /* number of decoding threads in total */ int num_aec_thread; /* number of threads for AEC decoding (the others are for reconstruction) */ int num_rec_thread; /* number of threads for reconstruction */ davs2_thread_t thread_output; /* handle of the frame output thread */ davs2_thread_mutex_t mutex_mgr; /* a non-recursive mutex */ davs2_thread_mutex_t mutex_aec; /* a non-recursive mutex for AEC */ void *thread_pool; /* thread pool for the decoding threads */ }; /* --------------------------------------------------------------------------- */ typedef struct davs2_row_rec_t { davs2_t *h; /* frame decoder handler */ lcu_info_t *lcu_info; /* LCU info for REC */ lcu_rec_info_t *p_rec_info; /* LCU reconstruction info */ int idx_cu_zscan; /* current CU scan order */ bool_t b_block_avail_top; /* availability of top block, used in second transform */ bool_t b_block_avail_left; /* availability of left block, used in second transform */ /* LCU position */ struct ctu_recon_t { int i_pix_x; int i_pix_y; int i_pix_x_c; int i_pix_y_c; int i_scu_x; int i_scu_y; int i_scu_xy; int i_spu_x; int i_spu_y; int i_ctu_w; /* width of CTU in luma */ int i_ctu_h; /* height of CTU in luma */ int i_ctu_w_c; /* width of CTU in chroma */ int i_ctu_h_c; /* height of CTU in chroma */ /* buffer pointers to picture */ int i_frec[3]; /* stride of reconstruction buffer (reconstruction picture) */ pel_t *p_frec[3]; /* reconstruction buffer pointer (reconstruction picture) */ /* buffer pointers to CTU cache */ int i_fdec[3]; /* stride of reconstruction buffer (current LCU) */ pel_t *p_fdec[3]; /* reconstruction buffer pointer (current LCU) */ } ctu; // CTU info /* buffers */ ALIGN32(pel_t buf_edge_pixels[MAX_CU_SIZE << 3]); /* intra prediction buffer */ ALIGN32(pel_t pred_blk[LCU_BUF_SIZE]); /* temporary buffer used for prediction */ // ALIGN32(pel_t fdec_buf[MAX_CU_SIZE * (MAX_CU_SIZE + (MAX_CU_SIZE >> 1))]); struct lcu_intra_border_t { ALIGN32(pel_t rec_left[MAX_CU_SIZE]); /* Left border of current LCU */ ALIGN32(pel_t rec_top[MAX_CU_SIZE * 2 + 32]); /* top-left, top and top-right samples (Reconstruction) of current LCU */ } ctu_border[IMG_COMPONENTS]; /* Y, U, V components */ } davs2_row_rec_t;
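/* ---------------------------------------------------------------------------
 * Illustration (editor's sketch, not part of the decoder): LCU-row parallel
 * reconstruction is coordinated through davs2_frame_t::i_decoded_line and
 * the per-row condition variables conds_lcu_row. A thread needing reference
 * samples up to line 'y_needed' of frame 'ref' would wait roughly like this
 * ('lcu_row' and the exact mutex/cond pairing are illustrative only):
 *
 *     davs2_thread_mutex_lock(&ref->mutex_recon);
 *     while (ref->i_decoded_line < y_needed) {
 *         davs2_thread_cond_wait(&ref->conds_lcu_row[lcu_row], &ref->mutex_recon);
 *     }
 *     davs2_thread_mutex_unlock(&ref->mutex_recon);
 */
/* --------------------------------------------------------------------------- */ struct davs2_t { davs2_log_t module_log; /* log module */ /* ------------------------------------------------------------- * task information */ davs2_task_t task_info; /* task information */ /*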
------------------------------------------------------------- * sequence */ davs2_seq_t seq_info; /* sequence head of this task */ /* ------------------------------------------------------------- * log */ int i_log_level; /* log level */ int i_image_width; /* decoded image width */ int i_image_height; /* decoded image height */ int i_chroma_format; /* chroma format (1: 4:2:0, 2: 4:2:2) */ int i_lcu_level; /* LCU size in bits */ int i_lcu_size; /* LCU size = 1 << i_lcu_level */ int i_lcu_size_sub1; /* LCU size - 1 = (1 << i_lcu_level) - 1 */ int i_display_delay; /* picture display delay */ int sample_bit_depth; /* sample bit depth */ int output_bit_depth; /* output bit depth (assuming: output_bit_depth <= sample_bit_depth) */ bool_t b_bkgnd_picture; /* background picture enabled? */ bool_t b_ra_decodable; /* random access decodable flag */ bool_t b_video_edit_code; /* video edit code */ /* ------------------------------------------------------------- * coding tools enabled */ bool_t b_roi; bool_t b_DQP; /* using DQP? */ bool_t b_sao; bool_t b_alf; // int b_dmh; /* ------------------------------------------------------------- * decoding */ davs2_bs_t *p_bs; /* input bitstream pointer */ aec_t aec; /* arithmetic entropy decoder */ int decoding_error; /* non-zero value indicates that a decoding error occurred */ /* ------------------------------------------------------------- * field */ bool_t b_top_field_first; bool_t b_repeat_first_field; bool_t b_top_field; /* ------------------------------------------------------------- * picture coding type */ int8_t i_frame_type; int8_t i_pic_coding_type; int8_t i_pic_struct; /* frame or field coding */ /* ------------------------------------------------------------- * picture properties */ int i_width; /* picture width in pixel (luma) */ int i_height; /* picture height in pixel (luma) */ int i_width_in_scu; /* width in SCU */ int i_height_in_scu; /* height in SCU */ int i_size_in_scu; /* number of SCU */ int i_width_in_spu; /* width in SPU */ int i_height_in_spu; /* height in SPU */ int i_width_in_lcu; /* width in LCU */ int i_height_in_lcu; /* height in LCU */ int i_picture_qp; int i_qp; /* quant for the current frame */ int i_poc; /* POC (picture order count) of current frame, 8 bit */ int i_coi; /* COI (coding order index) */ int i_cur_layer; int chroma_quant_param_delta_u; int chroma_quant_param_delta_v; bool_t b_fixed_picture_qp; bool_t b_bkgnd_reference; /* AVS2-S: background reference enabled? */ bool_t enable_chroma_quant_param; /* ------------------------------------------------------------- * slice */ bool_t b_slice_checked; /* is slice checked? */ bool_t b_fixed_slice_qp; int i_slice_index; /* current slice index */ int i_slice_qp; int i_last_dquant; pel_t *intra_border[3]; /* buffer for storing the decoded bottom pixels of the top lcu row (before filter) */ /* ------------------------------------------------------------- * reference frame */ int num_of_references; rps_t rps; davs2_frame_t *fref[AVS2_MAX_REFS]; davs2_frame_t *fdec; davs2_frame_t *f_background_cur; /* background reference frame, used for reconstruction */ davs2_frame_t *f_background_ref; /* background frame, used for reference */ davs2_frame_t *p_frame_sao; /* used for SAO */ davs2_frame_t *p_frame_alf; /* used for ALF */ lcu_info_t *lcu_infos; /* LCU level info */ /* ------------------------------------------------------------- * post processing */ /* deblock */ int b_loop_filter; /* loop filter enabled?
*/ int i_alpha_offset; int i_beta_offset; int alpha; int alpha_c; int beta; int beta_c; /* ALF */ alf_var_t *p_alf; bool_t pic_alf_on[IMG_COMPONENTS]; /* SAO */ bool_t slice_sao_on[IMG_COMPONENTS]; /* ------------------------------------------------------------- * buffers */ uint8_t *p_integral; /* holder: base pointer for all allocated memory */ /* intra mode */ int i_ipredmode; /* stride */ int8_t *p_ipredmode; /* intra prediction mode buffer */ /* scu */ cu_t *scu_data; /* ref & mv & inter prediction direction */ int8_t *p_dirpred; /* inter prediction direction */ ref_idx_t *p_ref_idx; /* reference index */ mv_t *p_tmv_1st; /* motion vector of 4x4 block (1st reference) */ mv_t *p_tmv_2nd; /* motion vector of 4x4 block (2nd reference) */ /* loop filter */ uint8_t *p_deblock_flag[2]; /* [v/h][b8_x, b8_y] */ /* ------------------------------------------------------------- * block availability */ const int8_t *p_tab_TR_avail; const int8_t *p_tab_DL_avail; /* ------------------------------------------------------------- * LCU-based cache */ struct lcu_t { /* geometrical properties */ ALIGN32(int i_pix_width); /* actual width (in pixel) for current lcu */ int i_pix_height; /* actual height (in pixel) for current lcu */ int i_scu_x; /* horizontal position for the first SCU in lcu */ int i_scu_y; /* vertical position for the first SCU in lcu */ int i_scu_xy; /* position for the first SCU in lcu */ int i_spu_x; /* horizontal position for the first SPU in lcu */ int i_spu_y; /* vertical position for the first SPU in lcu */ int i_pix_x; /* horizontal position (in pixel) of lcu (luma) */ int i_pix_y; /* vertical position (in pixel) of lcu (luma) */ int i_pix_c_x; /* horizontal position (in pixel) of lcu (chroma) */ int i_pix_c_y; /* vertical position (in pixel) of lcu (chroma) */ int idx_cu_zscan_aec; /* Z-scan index of current AEC CU within LCU (in 8x8 unit) */ /* buffer pointers */ lcu_info_t *lcu_aec; /* LCU info for AEC */ int8_t i_left_cu_qp; /* QP of left CU (for current CU decoding) */ int8_t c_ipred_mode_ctx; /* context of chroma intra prediction mode (for current CU decoding) */ neighbor_inter_t neighbor_inter[NUM_INTER_NEIGHBOR]; /* neighboring inter modes of 4x4 blocks */ int8_t ref_skip_1st[DS_MAX_NUM]; int8_t ref_skip_2nd[DS_MAX_NUM]; mv_t mv_tskip_1st[DS_MAX_NUM]; mv_t mv_tskip_2nd[DS_MAX_NUM]; #if !CTRL_AEC_THREAD lcu_rec_info_t rec_info; #endif ALIGN32(runlevel_t cg_info); } lcu; /* ------------------------------------------------------------- * adaptive frequency weighting quantization */ weighted_quant_t wq; // weight quant parameters }; /** * =========================================================================== * global variables * =========================================================================== */ #if HIGH_BIT_DEPTH extern int max_pel_value; extern int g_bit_depth; extern int g_dc_value; #else static const int g_bit_depth = BIT_DEPTH; static const int max_pel_value = (1 << BIT_DEPTH) - 1; static const pel_t g_dc_value = 128; #endif /** * =========================================================================== * common function declarations * =========================================================================== */ /** * --------------------------------------------------------------------------- * Function : output log information * Parameters : * [in] : h - decoder handle (NULL is allowed) * [in] : level - log level * [in] : format - printf-style format string * Return : none * --------------------------------------------------------------------------- */ void davs2_log(void *h, int level, const char *format, ...);
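/* ---------------------------------------------------------------------------
 * Illustration (editor's sketch, not part of the decoder): davs2_log() is
 * called with the decoder handle (or NULL before one exists) plus a
 * printf-style format, exactly as elsewhere in these sources:
 *
 *     davs2_log(h,    DAVS2_LOG_WARNING, "unable to determine cacheline size\n");
 *     davs2_log(NULL, DAVS2_LOG_ERROR,   "malloc of size %zu failed\n", i_size);
 */
/*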
--------------------------------------------------------------------------- * trace */ #if AVS2_TRACE int avs2_trace_init(char *psz_trace_file); void avs2_trace_destroy(void); int avs2_trace(const char *psz_fmt, ...); void avs2_trace_string(char *trace_string, int value, int len); void avs2_trace_string2(char *trace_string, int bit_pattern, int value, int len); #endif /* --------------------------------------------------------------------------- * memory alloc */ static ALWAYS_INLINE void *davs2_malloc(size_t i_size) { intptr_t mask = CACHE_LINE_SIZE - 1; uint8_t *align_buf = NULL; uint8_t *buf = (uint8_t *)malloc(i_size + mask + sizeof(void **)); /* over-allocate, then align */ if (buf != NULL) { align_buf = buf + mask + sizeof(void **); align_buf -= (intptr_t)align_buf & mask; *(((void **)align_buf) - 1) = buf; /* stash the raw pointer just below the returned address */ } else { #if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L davs2_log(NULL, DAVS2_LOG_ERROR, "malloc of size %zu failed\n", i_size); #else davs2_log(NULL, DAVS2_LOG_ERROR, "malloc of size %lu failed\n", i_size); #endif } return align_buf; } static ALWAYS_INLINE void *davs2_calloc(size_t count, size_t size) { void *p = davs2_malloc(count * size); if (p != NULL) { memset(p, 0, count * size); /* zero the whole allocated region */ } return p; } static ALWAYS_INLINE void davs2_free(void *ptr) { if (ptr != NULL) { free(*(((void **)ptr) - 1)); } } #if SYS_WINDOWS #define WIN32_LEAN_AND_MEAN #include <windows.h> #endif #include <time.h> /* --------------------------------------------------------------------------- * get timestamp in us */ static ALWAYS_INLINE int64_t davs2_get_us(void) { #if SYS_WINDOWS LARGE_INTEGER nFreq; if (QueryPerformanceFrequency(&nFreq)) { // non-zero return: the hardware supports a high-resolution counter LARGE_INTEGER t1; QueryPerformanceCounter(&t1); return (int64_t)(1000000 * t1.QuadPart / (double)nFreq.QuadPart); } else { // fall back to the millisecond-precision system clock int64_t tm = clock(); return (tm * (1000000 / CLOCKS_PER_SEC)); } #else int64_t tm = clock(); return (tm * (1000000 / CLOCKS_PER_SEC)); #endif } /** * =========================================================================== * inline function defines * =========================================================================== */ #if defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ > 3) #define davs2_clz(x) __builtin_clz(x) #define davs2_ctz(x) __builtin_ctz(x) #elif defined(_MSC_VER) && defined(_WIN32) static int ALWAYS_INLINE davs2_clz(const uint32_t x) { DWORD r; _BitScanReverse(&r, (DWORD)x); return (r ^ 31); } static int ALWAYS_INLINE davs2_ctz(const uint32_t x) { DWORD r; _BitScanForward(&r, (DWORD)x); return r; } #else static int ALWAYS_INLINE davs2_clz(uint32_t x) { static uint8_t lut[16] = { 4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 }; int y, z = (((x >> 16) - 1) >> 27) & 16; x >>= z ^ 16; z += y = ((x - 0x100) >> 28) & 8; x >>= y ^ 8; z += y = ((x - 0x10) >> 29) & 4; x >>= y ^ 4; return z + lut[x]; } static int ALWAYS_INLINE davs2_ctz(uint32_t x) { static uint8_t lut[16] = { 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 }; int y, z = (((x & 0xffff) - 1) >> 27) & 16; x >>= z; z += y = (((x & 0xff) - 1) >> 28) & 8; x >>= y; z += y = (((x & 0xf) - 1) >> 29) & 4; x >>= y; return z + lut[x & 0xf]; } #endif static ALWAYS_INLINE pel_t davs2_clip_pixel(int x) { return (pel_t)((x & ~max_pel_value) ? (-x) >> 31 & max_pel_value : x); }
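/* ---------------------------------------------------------------------------
 * Illustration (editor's sketch, not part of the decoder): davs2_clip_pixel()
 * above clips to [0, max_pel_value] without a branch on the in-range path.
 * When x has any bit outside the legal range, (-x) >> 31 is 0 for negative x
 * and all-ones for x > max_pel_value, so the masked result is 0 or max:
 *
 *     davs2_clip_pixel(-3)  -> 0
 *     davs2_clip_pixel(128) -> 128
 *     davs2_clip_pixel(300) -> 255   (8-bit build: max_pel_value == 255)
 */
static ALWAYS_INLINE int davs2_clip3(int v, int i_min, int i_max) { return ((v < i_min) ? i_min : (v > i_max) ?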
i_max : v); } static ALWAYS_INLINE int davs2_median(int a, int b, int c) { int t = (a - b) & ((a - b) >> 31); a -= t; b += t; b -= (b - c) & ((b - c) >> 31); b += (a - b) & ((a - b) >> 31); return b; } // sign of the value: returns -1 if negative, otherwise 1 static ALWAYS_INLINE int davs2_sign2(int val) { return ((val >> 31) << 1) + 1; } // sign of the value: returns -1 for negative, 0 for zero, 1 for positive static ALWAYS_INLINE int davs2_sign3(int val) { return (val >> 31) | (int)(((uint32_t)-val) >> 31u); } // log2 of val, assuming val is a power of 2 (returns 0 for val 0 or 1) #define davs2_log2u(val) davs2_ctz(val) /* --------------------------------------------------------------------------- * unions for type-punning. * Mn: load or store n bits, aligned, native-endian * CPn: copy n bits, aligned, native-endian * we don't use memcpy for CPn because memcpy's args aren't assumed * to be aligned */ typedef union { uint16_t i; uint8_t c[2]; } MAY_ALIAS davs2_union16_t; typedef union { uint32_t i; uint16_t b[2]; uint8_t c[4]; } MAY_ALIAS davs2_union32_t; typedef union { uint64_t i; uint32_t a[2]; uint16_t b[4]; uint8_t c[8]; } MAY_ALIAS davs2_union64_t; #define M16(src) (((davs2_union16_t *)(src))->i) #define M32(src) (((davs2_union32_t *)(src))->i) #define M64(src) (((davs2_union64_t *)(src))->i) #define CP16(dst,src) M16(dst) = M16(src) #define CP32(dst,src) M32(dst) = M32(src) #define CP64(dst,src) M64(dst) = M64(src) /* --------------------------------------------------------------------------- * assert */ #define DAVS2_ASSERT(expression, ...) if (!(expression)) { davs2_log(NULL, DAVS2_LOG_ERROR, __VA_ARGS__); } /* --------------------------------------------------------------------------- * list */ #define xl_init FPFX(xl_init) int xl_init (xlist_t *const xlist); #define xl_destroy FPFX(xl_destroy) void xl_destroy (xlist_t *const xlist); #define xl_append FPFX(xl_append) void xl_append (xlist_t *const xlist, void *node); #define xl_remove_head FPFX(xl_remove_head) void *xl_remove_head (xlist_t *const xlist, const int wait); #define xl_remove_head_ex FPFX(xl_remove_head_ex) void *xl_remove_head_ex(xlist_t *const xlist); #ifdef __cplusplus } #endif #endif // DAVS2_COMMON_H
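/* ---------------------------------------------------------------------------
 * Illustration (editor's sketch, not part of the decoder): davs2_median()
 * above is a branchless median-of-three: the first two statements order
 * (a, b) so that a >= b, then b is raised to max(b, c) and finally lowered
 * to min(a, b). For example:
 *
 *     davs2_median(3, 9, 5) == 5
 *     davs2_median(7, 1, 2) == 2
 */
davs2-1.6/source/common/cpu.cc000066400000000000000000000347501337322544400163050ustar00rootroot00000000000000/* * cpu.cc * * Description of this file: * CPU-Processing functions definition of the davs2 library * * -------------------------------------------------------------------------- * * davs2 - video decoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * * -------------------------------------------------------------------------- * Copyright (C) 2013-2017 MulticoreWare, Inc * * Authors: Loren Merritt * Laurent Aimar * Fiona Glaser * Steve Borho * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at license @ x265.com.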
*/ #include "common.h" #include "cpu.h" #if SYS_MACOSX || SYS_FREEBSD #include <sys/types.h> #include <sys/sysctl.h> #endif #if SYS_OPENBSD #include <sys/param.h> #include <sys/sysctl.h> #include <machine/cpu.h> #endif #if ARCH_ARM #include <signal.h> #include <setjmp.h> static sigjmp_buf jmpbuf; static volatile sig_atomic_t canjump = 0; static void sigill_handler(int sig) { if (!canjump) { signal(sig, SIG_DFL); raise(sig); } canjump = 0; siglongjmp(jmpbuf, 1); } #endif // if ARCH_ARM #ifdef __cplusplus extern "C" { #endif /* --------------------------------------------------------------------------- */ typedef struct { const char *name; int flags; } davs2_cpu_name_t; /* --------------------------------------------------------------------------- */ static const davs2_cpu_name_t davs2_cpu_names[] = { #if ARCH_X86 || ARCH_X86_64 #define MMX2 DAVS2_CPU_MMX | DAVS2_CPU_MMX2 | DAVS2_CPU_CMOV { "MMX2", MMX2 }, { "MMXEXT", MMX2 }, { "SSE", MMX2 | DAVS2_CPU_SSE }, #define SSE2 MMX2 | DAVS2_CPU_SSE | DAVS2_CPU_SSE2 { "SSE2Slow", SSE2 | DAVS2_CPU_SSE2_IS_SLOW }, { "SSE2", SSE2 }, { "SSE2Fast", SSE2 | DAVS2_CPU_SSE2_IS_FAST }, { "SSE3", SSE2 | DAVS2_CPU_SSE3 }, { "SSSE3", SSE2 | DAVS2_CPU_SSE3 | DAVS2_CPU_SSSE3 }, { "SSE4.1", SSE2 | DAVS2_CPU_SSE3 | DAVS2_CPU_SSSE3 | DAVS2_CPU_SSE4 }, { "SSE4", SSE2 | DAVS2_CPU_SSE3 | DAVS2_CPU_SSSE3 | DAVS2_CPU_SSE4 }, { "SSE4.2", SSE2 | DAVS2_CPU_SSE3 | DAVS2_CPU_SSSE3 | DAVS2_CPU_SSE4 | DAVS2_CPU_SSE42 }, #define AVX SSE2 | DAVS2_CPU_SSE3 | DAVS2_CPU_SSSE3 | DAVS2_CPU_SSE4 | DAVS2_CPU_SSE42 | DAVS2_CPU_AVX { "AVX", AVX }, { "XOP", AVX | DAVS2_CPU_XOP }, { "FMA4", AVX | DAVS2_CPU_FMA4 }, { "AVX2", AVX | DAVS2_CPU_AVX2 }, { "FMA3", AVX | DAVS2_CPU_FMA3 }, #undef AVX #undef SSE2 #undef MMX2 { "Cache32", DAVS2_CPU_CACHELINE_32 }, { "Cache64", DAVS2_CPU_CACHELINE_64 }, { "LZCNT", DAVS2_CPU_LZCNT }, { "BMI1", DAVS2_CPU_BMI1 }, { "BMI2", DAVS2_CPU_BMI1 | DAVS2_CPU_BMI2 }, { "SlowCTZ", DAVS2_CPU_SLOW_CTZ }, { "SlowAtom", DAVS2_CPU_SLOW_ATOM }, { "SlowPshufb", DAVS2_CPU_SLOW_PSHUFB }, { "SlowPalignr", DAVS2_CPU_SLOW_PALIGNR }, { "SlowShuffle", DAVS2_CPU_SLOW_SHUFFLE }, { "UnalignedStack", DAVS2_CPU_STACK_MOD4 }, #elif ARCH_ARM { "ARMv6", DAVS2_CPU_ARMV6 }, { "NEON", DAVS2_CPU_NEON }, { "FastNeonMRC", DAVS2_CPU_FAST_NEON_MRC }, #endif // if ARCH_X86 || ARCH_X86_64 { "", 0 } }; #ifdef __cplusplus } #endif /* --------------------------------------------------------------------------- */ char *davs2_get_simd_capabilities(char *buf, uint32_t cpuid) { char *p = buf; for (int i = 0; davs2_cpu_names[i].flags; i++) { if (!strcmp(davs2_cpu_names[i].name, "SSE") && (cpuid & DAVS2_CPU_SSE2)) continue; if (!strcmp(davs2_cpu_names[i].name, "SSE2") && (cpuid & (DAVS2_CPU_SSE2_IS_FAST | DAVS2_CPU_SSE2_IS_SLOW))) continue; if (!strcmp(davs2_cpu_names[i].name, "SSE3") && (cpuid & DAVS2_CPU_SSSE3 || !(cpuid & DAVS2_CPU_CACHELINE_64))) continue; if (!strcmp(davs2_cpu_names[i].name, "SSE4.1") && (cpuid & DAVS2_CPU_SSE42)) continue; if (!strcmp(davs2_cpu_names[i].name, "BMI1") && (cpuid & DAVS2_CPU_BMI2)) continue; if ((cpuid & davs2_cpu_names[i].flags) == davs2_cpu_names[i].flags && (!i || davs2_cpu_names[i].flags != davs2_cpu_names[i - 1].flags)) p += sprintf(p, " %s", davs2_cpu_names[i].name); } if (p == buf) { sprintf(p, " none!
0x%x", cpuid); } return buf; } #if !ARCH_X86_64 /* test whether the CPUID instruction is available (needed on 32-bit x86 only) */ int davs2_cpu_cpuid_test(void); #endif #if HAVE_MMX /* --------------------------------------------------------------------------- */ uint32_t davs2_cpu_detect(void) { uint32_t cpuid = 0; uint32_t eax, ebx, ecx, edx; uint32_t vendor[4] = { 0 }; uint32_t max_extended_cap, max_basic_cap; #if !ARCH_X86_64 if (!davs2_cpu_cpuid_test()) { return 0; } #endif davs2_cpu_cpuid(0, &eax, vendor + 0, vendor + 2, vendor + 1); max_basic_cap = eax; if (max_basic_cap == 0) { return 0; } davs2_cpu_cpuid(1, &eax, &ebx, &ecx, &edx); if (edx & 0x00800000) { cpuid |= DAVS2_CPU_MMX; } else { return cpuid; } if (edx & 0x02000000) { cpuid |= DAVS2_CPU_MMX2 | DAVS2_CPU_SSE; } if (edx & 0x00008000) { cpuid |= DAVS2_CPU_CMOV; } else { return cpuid; } if (edx & 0x04000000) { cpuid |= DAVS2_CPU_SSE2; } if (ecx & 0x00000001) { cpuid |= DAVS2_CPU_SSE3; } if (ecx & 0x00000200) { cpuid |= DAVS2_CPU_SSSE3; } if (ecx & 0x00080000) { cpuid |= DAVS2_CPU_SSE4; } if (ecx & 0x00100000) { cpuid |= DAVS2_CPU_SSE42; } /* Check OSXSAVE and AVX bits */ if ((ecx & 0x18000000) == 0x18000000) { /* Check for OS support */ davs2_cpu_xgetbv(0, &eax, &edx); if ((eax & 0x6) == 0x6) { cpuid |= DAVS2_CPU_AVX; if (ecx & 0x00001000) { cpuid |= DAVS2_CPU_FMA3; } } } if (max_basic_cap >= 7) { davs2_cpu_cpuid(7, &eax, &ebx, &ecx, &edx); /* AVX2 requires OS support, but BMI1/2 don't. */ if ((cpuid & DAVS2_CPU_AVX) && (ebx & 0x00000020)) { cpuid |= DAVS2_CPU_AVX2; } if (ebx & 0x00000008) { cpuid |= DAVS2_CPU_BMI1; if (ebx & 0x00000100) { cpuid |= DAVS2_CPU_BMI2; } } } if (cpuid & DAVS2_CPU_SSSE3) { cpuid |= DAVS2_CPU_SSE2_IS_FAST; } davs2_cpu_cpuid(0x80000000, &eax, &ebx, &ecx, &edx); max_extended_cap = eax; if (max_extended_cap >= 0x80000001) { davs2_cpu_cpuid(0x80000001, &eax, &ebx, &ecx, &edx); if (ecx & 0x00000020) cpuid |= DAVS2_CPU_LZCNT; /* Supported by Intel chips starting with Haswell */ if (ecx & 0x00000040) { /* SSE4a, AMD only */ int family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); cpuid |= DAVS2_CPU_SSE2_IS_FAST; /* Phenom and later CPUs have fast SSE units */ if (family == 0x14) { cpuid &= ~DAVS2_CPU_SSE2_IS_FAST; /* SSSE3 doesn't imply fast SSE anymore... */ cpuid |= DAVS2_CPU_SSE2_IS_SLOW; /* Bobcat has 64-bit SIMD units */ cpuid |= DAVS2_CPU_SLOW_PALIGNR; /* palignr is insanely slow on Bobcat */ } if (family == 0x16) { cpuid |= DAVS2_CPU_SLOW_PSHUFB; /* Jaguar's pshufb isn't that slow, but it's slow enough * compared to alternate instruction sequences that this * is equal or faster on almost all such functions. */ } } if (cpuid & DAVS2_CPU_AVX) { if (ecx & 0x00000800) { /* XOP */ cpuid |= DAVS2_CPU_XOP; } if (ecx & 0x00010000) { /* FMA4 */ cpuid |= DAVS2_CPU_FMA4; } } if (!strcmp((char*)vendor, "AuthenticAMD")) { if (edx & 0x00400000) { cpuid |= DAVS2_CPU_MMX2; } if (!(cpuid & DAVS2_CPU_LZCNT)) { cpuid |= DAVS2_CPU_SLOW_CTZ; } if ((cpuid & DAVS2_CPU_SSE2) && !(cpuid & DAVS2_CPU_SSE2_IS_FAST)) { cpuid |= DAVS2_CPU_SSE2_IS_SLOW; /* AMD CPUs come in two types: terrible at SSE and great at it */ } } } if (!strcmp((char*)vendor, "GenuineIntel")) { int family, model; davs2_cpu_cpuid(1, &eax, &ebx, &ecx, &edx); family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); model = ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0); if (family == 6) { /* 6/9 (pentium-m "banias"), 6/13 (pentium-m "dothan"), and 6/14 (core1 "yonah") * theoretically support sse2, but it's significantly slower than mmx for * almost all of x264's functions, so let's just pretend they don't.
*/ if (model == 9 || model == 13 || model == 14) { cpuid &= ~(DAVS2_CPU_SSE2 | DAVS2_CPU_SSE3); //DAVS2_CHECK(!(cpuid & (DAVS2_CPU_SSSE3 | DAVS2_CPU_SSE4)), "unexpected CPU ID %d\n", cpuid); } else if (model == 28) { /* Detect Atom CPU */ cpuid |= DAVS2_CPU_SLOW_ATOM; cpuid |= DAVS2_CPU_SLOW_CTZ; cpuid |= DAVS2_CPU_SLOW_PSHUFB; } else if ((cpuid & DAVS2_CPU_SSSE3) && !(cpuid & DAVS2_CPU_SSE4) && model < 23) { /* Conroe has a slow shuffle unit. Check the model number to make sure not * to include crippled low-end Penryns and Nehalems that don't have SSE4. */ cpuid |= DAVS2_CPU_SLOW_SHUFFLE; } } } if ((!strcmp((char*)vendor, "GenuineIntel") || !strcmp((char*)vendor, "CyrixInstead")) && !(cpuid & DAVS2_CPU_SSE42)) { /* cacheline size is specified in 3 places, any of which may be missing */ int cache; davs2_cpu_cpuid(1, &eax, &ebx, &ecx, &edx); cache = (ebx & 0xff00) >> 5; // CLFLUSH line size, in bytes if (!cache && max_extended_cap >= 0x80000006) { davs2_cpu_cpuid(0x80000006, &eax, &ebx, &ecx, &edx); cache = ecx & 0xff; // cacheline size } if (!cache && max_basic_cap >= 2) { // Cache and TLB Information static const uint8_t cache32_ids[] = { 0x0a, 0x0c, 0x41, 0x42, 0x43, 0x44, 0x45, 0x82, 0x83, 0x84, 0x85, 0 }; static const uint8_t cache64_ids[] = { 0x22, 0x23, 0x25, 0x29, 0x2c, 0x46, 0x47, 0x49, 0x60, 0x66, 0x67, 0x68, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7c, 0x7f, 0x86, 0x87, 0 }; uint32_t buf[4]; int max, i = 0, j; do { davs2_cpu_cpuid(2, buf + 0, buf + 1, buf + 2, buf + 3); max = buf[0] & 0xff; buf[0] &= ~0xff; for (j = 0; j < 4; j++) { if (!(buf[j] >> 31)) { while (buf[j]) { if (strchr((const char *)cache32_ids, buf[j] & 0xff)) { cache = 32; } if (strchr((const char *)cache64_ids, buf[j] & 0xff)) { cache = 64; } buf[j] >>= 8; } } } } while (++i < max); } if (cache == 32) { cpuid |= DAVS2_CPU_CACHELINE_32; } else if (cache == 64) { cpuid |= DAVS2_CPU_CACHELINE_64; } else { davs2_log(NULL, DAVS2_LOG_WARNING, "unable to determine cacheline size\n"); } } #ifdef BROKEN_STACK_ALIGNMENT cpuid |= DAVS2_CPU_STACK_MOD4; #endif return cpuid; } #endif // if HAVE_MMX #if SYS_LINUX && !(defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7__)) /* --------------------------------------------------------------------------- */ int sched_getaffinity(pid_t pid, size_t cpusetsize, cpu_set_t *mask); #endif /* --------------------------------------------------------------------------- */ int davs2_cpu_num_processors(void) { #if !HAVE_THREAD return 1; #elif defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7__) return 2; #elif SYS_WINDOWS return davs2_thread_num_processors_np(); #elif SYS_LINUX unsigned int bit; int np = 0; cpu_set_t p_aff; memset(&p_aff, 0, sizeof(p_aff)); sched_getaffinity(0, sizeof(p_aff), &p_aff); for (bit = 0; bit < sizeof(p_aff); bit++) { np += (((uint8_t *)& p_aff)[bit / 8] >> (bit % 8)) & 1; } return np; #elif SYS_BEOS system_info info; get_system_info(&info); return info.cpu_count; #elif SYS_MACOSX || SYS_FREEBSD || SYS_OPENBSD int numberOfCPUs; size_t length = sizeof (numberOfCPUs); #if SYS_OPENBSD int mib[2] = { CTL_HW, HW_NCPU }; if(sysctl(mib, 2, &numberOfCPUs, &length, NULL, 0)) #else if(sysctlbyname("hw.ncpu", &numberOfCPUs, &length, NULL, 0)) #endif { numberOfCPUs = 1; } return numberOfCPUs; #else return 1; #endif }
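/* ---------------------------------------------------------------------------
 * Illustration (editor's sketch, not part of the decoder): the detection
 * routines above are meant to be combined at start-up, e.g.
 *
 *     char buf[256];
 *     uint32_t cpuid = davs2_cpu_detect();
 *     printf("using%s, %d processor(s)\n",
 *            davs2_get_simd_capabilities(buf, cpuid),
 *            davs2_cpu_num_processors());
 *
 * davs2_get_simd_capabilities() fills 'buf' with a space-separated list
 * such as " SSE4.2 AVX2" and returns 'buf' itself.
 */
davs2-1.6/source/common/cpu.h000066400000000000000000000061451337322544400161440ustar00rootroot00000000000000/* * cpu.h * * Description of this file: * CPU-Processing functions definition of the davs2 library * * --------------------------------------------------------------------------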
* * davs2 - video decoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ #ifndef DAVS2_CPU_H #define DAVS2_CPU_H #ifdef __cplusplus extern "C" { #endif #define davs2_cpu_detect FPFX(cpu_detect) uint32_t davs2_cpu_detect(void); #define davs2_cpu_num_processors FPFX(cpu_num_processors) int davs2_cpu_num_processors(void); #define avs_cpu_emms FPFX(avs_cpu_emms) void avs_cpu_emms(void); #define avs_cpu_mask_misalign_sse FPFX(avs_cpu_mask_misalign_sse) void avs_cpu_mask_misalign_sse(void); #define avs_cpu_sfence FPFX(avs_cpu_sfence) void avs_cpu_sfence(void); #define davs2_get_simd_capabilities FPFX(get_simd_capabilities) char *davs2_get_simd_capabilities(char *buf, uint32_t cpuid); #if HAVE_MMX #define davs2_cpu_cpuid FPFX(cpu_cpuid) uint32_t davs2_cpu_cpuid(uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx); #define davs2_cpu_xgetbv FPFX(cpu_xgetbv) void davs2_cpu_xgetbv(uint32_t op, uint32_t *eax, uint32_t *edx); #define avs_emms() avs_cpu_emms() #else #define avs_emms() #endif #define avs_sfence avs_cpu_sfence /* kluge: * gcc can't give variables any greater alignment than the stack frame has. * We need 16 byte alignment for SSE2, so here we make sure that the stack is * aligned to 16 bytes. * gcc 4.2 introduced __attribute__((force_align_arg_pointer)) to fix this * problem, but I don't want to require such a new version. * This applies only to x86_32, since other architectures that need alignment * also have ABIs that ensure aligned stack. */ #if ARCH_X86 && HAVE_MMX //int xavs_stack_align(void(*func) (xavs_t *), xavs_t * arg); //#define avs_stack_align(func,arg) avs_stack_align((void (*)(xavs_t*))func,arg) #else #define avs_stack_align(func,...) func(__VA_ARGS__) #endif #define avs_cpu_restore FPFX(avs_cpu_restore) void avs_cpu_restore(uint32_t cpuid); #ifdef __cplusplus } #endif #endif // DAVS2_CPU_H davs2-1.6/source/common/cu.cc000066400000000000000000001534161337322544400161260ustar00rootroot00000000000000/* * cu.cc * * Description of this file: * CU Processing functions definition of the davs2 library * * -------------------------------------------------------------------------- * * davs2 - video decoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version.
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ #include "common.h" #include "cu.h" #include "vlc.h" #include "transform.h" #include "intra.h" #include "predict.h" #include "block_info.h" #include "aec.h" #include "mc.h" #include "sao.h" #include "quant.h" #include "scantab.h" /** * =========================================================================== * local & global variables (const tables) * =========================================================================== */ static const int tab_b8xy_to_zigzag[8][8] = { { 0, 1, 4, 5, 16, 17, 20, 21 }, { 2, 3, 6, 7, 18, 19, 22, 23 }, { 8, 9, 12, 13, 24, 25, 28, 29 }, { 10, 11, 14, 15, 26, 27, 30, 31 }, { 32, 33, 36, 37, 48, 49, 52, 53 }, { 34, 35, 38, 39, 50, 51, 54, 55 }, { 40, 41, 44, 45, 56, 57, 60, 61 }, { 42, 43, 46, 47, 58, 59, 62, 63 } }; /* --------------------------------------------------------------------------- */ const uint8_t QP_SCALE_CR[64] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 42, 43, 43, 44, 44, 45, 45, 46, 46, 47, 47, 48, 48, 48, 49, 49, 49, 50, 50, 50, 51, }; /* --------------------------------------------------------------------------- */ static const int8_t dmh_pos[DMH_MODE_NUM + DMH_MODE_NUM - 1][2][2] = { { { 0, 0 }, { 0, 0 } }, { { -1, 0 }, { 1, 0 } }, { { 0, -1 }, { 0, 1 } }, { { -1, 1 }, { 1, -1 } }, { { -1, -1 }, { 1, 1 } }, { { -2, 0 }, { 2, 0 } }, { { 0, -2 }, { 0, 2 } }, { { -2, 2 }, { 2, -2 } }, { { -2, -2 }, { 2, 2 } } }; /* --------------------------------------------------------------------------- */ const int16_t IQ_SHIFT[80] = { 15, 15, 15, 15, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6 }; /* --------------------------------------------------------------------------- */ const uint16_t IQ_TAB[80] = { 32768, 36061, 38968, 42495, 46341, 50535, 55437, 60424, 32932, 35734, 38968, 42495, 46177, 50535, 55109, 59933, 65535, 35734, 38968, 42577, 46341, 50617, 55027, 60097, 32809, 35734, 38968, 42454, 46382, 50576, 55109, 60056, 65535, 35734, 38968, 42495, 46320, 50515, 55109, 60076, 65535, 35744, 38968, 42495, 46341, 50535, 55099, 60087, 65535, 35734, 38973, 42500, 46341, 50535, 55109, 60097, 32771, 35734, 38965, 42497, 46341, 50535, 55109, 60099, 32768, 36061, 38968, 42495, 46341, 50535, 55437, 60424, 32932, 35734, 38968, 42495, 46177, 50535, 55109, 59933 }; #if AVS2_TRACE extern int symbolCount; #endif /** * =========================================================================== * function defines * =========================================================================== */ /* --------------------------------------------------------------------------- * used for debug */ static INLINE bool_t is_inside_cu(int cu_pix_x, int 
cu_pix_y, int i_cu_level, int i_pix_x, int i_pix_y) { int cu_size = 1 << i_cu_level; return cu_pix_x <= i_pix_x && (cu_pix_x + cu_size) > i_pix_x && cu_pix_y <= i_pix_y && (cu_pix_y + cu_size) > i_pix_y; } /* --------------------------------------------------------------------------- * obtain the pos and size of prediction units (PUs) */ static ALWAYS_INLINE void cu_init_prediction_units(davs2_t *h, cu_t *p_cu) { /* --------------------------------------------------------------------------- */ static const int NUM_PREDICTION_UNIT[MAX_PRED_MODES] = {// [mode] 1, // 0: 8x8, ---, ---, --- (PRED_SKIP ) 1, // 1: 8x8, ---, ---, --- (PRED_2Nx2N ) 2, // 2: 8x4, 8x4, ---, --- (PRED_2NxN ) 2, // 3: 4x8, 4x8, ---, --- (PRED_Nx2N ) 2, // 4: 8x2, 8x6, ---, --- (PRED_2NxnU ) 2, // 5: 8x6, 8x2, ---, --- (PRED_2NxnD ) 2, // 6: 2x8, 6x8, ---, --- (PRED_nLx2N ) 2, // 7: 6x8, 2x8, ---, --- (PRED_nRx2N ) 1, // 8: 8x8, ---, ---, --- (PRED_I_2Nx2N) 4, // 9: 4x4, 4x4, 4x4, 4x4 (PRED_I_NxN ) 4, //10: 8x2, 8x2, 8x2, 8x2 (PRED_I_2Nxn ) 4 //11: 2x8, 2x8, 2x8, 2x8 (PRED_I_nx2N ) }; static const cb_t CODING_BLOCK_INFO[MAX_PRED_MODES + 1][4] = {// [mode][block] // x, y, w, h x, y, w, h x, y, w, h x, y, w, h for block 0, 1, 2 and 3 {{{0, 0, 8, 8}}, {{0, 0, 0, 0}}, {{0, 0, 0, 0}}, {{0, 0, 0, 0}}}, // 0: 8x8, ---, ---, --- (PRED_SKIP ) {{{0, 0, 8, 8}}, {{0, 0, 0, 0}}, {{0, 0, 0, 0}}, {{0, 0, 0, 0}}}, // 1: 8x8, ---, ---, --- (PRED_2Nx2N ) {{{0, 0, 8, 4}}, {{0, 4, 8, 4}}, {{0, 0, 0, 0}}, {{0, 0, 0, 0}}}, // 2: 8x4, 8x4, ---, --- (PRED_2NxN ) {{{0, 0, 4, 8}}, {{4, 0, 4, 8}}, {{0, 0, 0, 0}}, {{0, 0, 0, 0}}}, // 3: 4x8, 4x8, ---, --- (PRED_Nx2N ) {{{0, 0, 8, 2}}, {{0, 2, 8, 6}}, {{0, 0, 0, 0}}, {{0, 0, 0, 0}}}, // 4: 8x2, 8x6, ---, --- (PRED_2NxnU ) {{{0, 0, 8, 6}}, {{0, 6, 8, 2}}, {{0, 0, 0, 0}}, {{0, 0, 0, 0}}}, // 5: 8x6, 8x2, ---, --- (PRED_2NxnD ) {{{0, 0, 2, 8}}, {{2, 0, 6, 8}}, {{0, 0, 0, 0}}, {{0, 0, 0, 0}}}, // 6: 2x8, 6x8, ---, --- (PRED_nLx2N ) {{{0, 0, 6, 8}}, {{6, 0, 2, 8}}, {{0, 0, 0, 0}}, {{0, 0, 0, 0}}}, // 7: 6x8, 2x8, ---, --- (PRED_nRx2N ) {{{0, 0, 8, 8}}, {{0, 0, 0, 0}}, {{0, 0, 0, 0}}, {{0, 0, 0, 0}}}, // 8: 8x8, ---, ---, --- (PRED_I_2Nx2N) {{{0, 0, 4, 4}}, {{4, 0, 4, 4}}, {{0, 4, 4, 4}}, {{4, 4, 4, 4}}}, // 9: 4x4, 4x4, 4x4, 4x4 (PRED_I_NxN ) {{{0, 0, 8, 2}}, {{0, 2, 8, 2}}, {{0, 4, 8, 2}}, {{0, 6, 8, 2}}}, //10: 8x2, 8x2, 8x2, 8x2 (PRED_I_2Nxn ) {{{0, 0, 2, 8}}, {{2, 0, 2, 8}}, {{4, 0, 2, 8}}, {{6, 0, 2, 8}}}, //11: 2x8, 2x8, 2x8, 2x8 (PRED_I_nx2N ) {{{0, 0, 4, 4}}, {{4, 0, 4, 4}}, {{0, 4, 4, 4}}, {{4, 4, 4, 4}}}, // X: 4x4, 4x4, 4x4, 4x4 }; const int i_level = p_cu->i_cu_level; const int i_mode = p_cu->i_cu_type; const int shift_bits = i_level - MIN_CU_SIZE_IN_BIT; const int block_num = NUM_PREDICTION_UNIT[i_mode]; int ds_mode = p_cu->i_md_directskip_mode; int i; cb_t *p_cb = p_cu->pu; // memset(p_cb, 0, 4 * sizeof(cb_t)); // set for each block if (i_mode == PRED_SKIP) { ///! 
For some special Skip/Direct modes, a CU larger than 8x8 is split into 4 PUs if (i_level > 3 && (h->i_frame_type == AVS2_P_SLICE || (h->i_frame_type == AVS2_F_SLICE && ds_mode == DS_NONE) || (h->i_frame_type == AVS2_B_SLICE && ds_mode == DS_NONE))) { p_cu->num_pu = 4; for (i = 0; i < 4; i++) { p_cb[i].v = CODING_BLOCK_INFO[PRED_I_nx2N + 1][i].v << shift_bits; } } else { p_cu->num_pu = 1; p_cb[0].v = CODING_BLOCK_INFO[PRED_SKIP][0].v << shift_bits; } } else { p_cu->num_pu = (int8_t)block_num; for (i = 0; i < block_num; i++) { p_cb[i].v = CODING_BLOCK_INFO[i_mode][i].v << shift_bits; } } } /* --------------------------------------------------------------------------- * obtain the pos and size of transform units (TUs) */ static ALWAYS_INLINE void cu_init_transform_units(cu_t *p_cu, cb_t *p_tu) { static const cb_t TU_SPLIT_INFO[TU_SPLIT_CROSS+1][4] = {// [mode][block] // x, y, w, h x, y, w, h x, y, w, h x, y, w, h for block 0, 1, 2 and 3 {{{0, 0, 8, 8}}, {{0, 0, 0, 0}}, {{0, 0, 0, 0}}, {{0, 0, 0, 0}}}, // TU_SPLIT_NON {{{0, 0, 8, 2}}, {{0, 2, 8, 2}}, {{0, 4, 8, 2}}, {{0, 6, 8, 2}}}, // TU_SPLIT_HOR {{{0, 0, 2, 8}}, {{2, 0, 2, 8}}, {{4, 0, 2, 8}}, {{6, 0, 2, 8}}}, // TU_SPLIT_VER {{{0, 0, 4, 4}}, {{4, 0, 4, 4}}, {{0, 4, 4, 4}}, {{4, 4, 4, 4}}}, // TU_SPLIT_CROSS }; const int shift_bits = p_cu->i_cu_level - MIN_CU_SIZE_IN_BIT; const int i_tu_type = p_cu->i_trans_size; p_tu[0].v = TU_SPLIT_INFO[i_tu_type][0].v << shift_bits; p_tu[1].v = TU_SPLIT_INFO[i_tu_type][1].v << shift_bits; p_tu[2].v = TU_SPLIT_INFO[i_tu_type][2].v << shift_bits; p_tu[3].v = TU_SPLIT_INFO[i_tu_type][3].v << shift_bits; } /* --------------------------------------------------------------------------- * get neighboring MVs for MVP */ static void cu_get_neighbors(davs2_t *h, cu_t *p_cu, int pix_x, int pix_y, int bsx, int bsy) { neighbor_inter_t *neighbors = h->lcu.neighbor_inter; int cur_slice_idx = p_cu->i_slice_nr; int x0 = (pix_x >> MIN_PU_SIZE_IN_BIT); int y0 = (pix_y >> MIN_PU_SIZE_IN_BIT); int x1 = (bsx >> MIN_PU_SIZE_IN_BIT) + x0 - 1; int y1 = (bsy >> MIN_PU_SIZE_IN_BIT) + y0 - 1; /* 1. check whether the top-right 4x4 block is reconstructed */ int x_top_right_4x4_in_lcu = x1 - h->lcu.i_spu_x; int y_top_right_4x4_in_lcu = y0 - h->lcu.i_spu_y; int block_available_TR = h->p_tab_TR_avail[(y_top_right_4x4_in_lcu << (h->i_lcu_level - B4X4_IN_BIT)) + x_top_right_4x4_in_lcu]; /* 2. get neighboring blocks */ cu_get_neighbor_spatial(h, cur_slice_idx, &neighbors[BLK_LEFT ], x0 - 1, y0 ); cu_get_neighbor_spatial(h, cur_slice_idx, &neighbors[BLK_TOP ], x0 , y0 - 1); cu_get_neighbor_spatial(h, cur_slice_idx, &neighbors[BLK_TOP2 ], x1 , y0 - 1); cu_get_neighbor_spatial(h, cur_slice_idx, &neighbors[BLK_TOPLEFT ], x0 - 1, y0 - 1); cu_get_neighbor_spatial(h, cur_slice_idx, &neighbors[BLK_LEFT2 ], x0 - 1, y1 ); cu_get_neighbor_spatial(h, cur_slice_idx, &neighbors[BLK_TOPRIGHT], block_available_TR ?
x1 + 1 : -1, y0 - 1); cu_get_neighbor_temporal(h, &neighbors[BLK_COLLOCATED], x0, y0); } /* --------------------------------------------------------------------------- */ static INLINE void cu_init(davs2_t *h, cu_t *p_cu, int i_level, int scu_xy, int pix_x) { assert(scu_xy >= 0 && scu_xy < h->i_size_in_scu); // reset syntax element entries in cu_t p_cu->i_cu_level = (int8_t)i_level; p_cu->i_qp = (int8_t)h->i_qp; p_cu->i_cu_type = PRED_SKIP; p_cu->i_cbp = 0; p_cu->c_ipred_mode = DC_PRED_C; p_cu->i_dmh_mode = 0; memset(p_cu->dct_pattern, 0, sizeof(p_cu->dct_pattern)); // check left CU h->lcu.i_left_cu_qp = (int8_t)h->i_qp; h->lcu.c_ipred_mode_ctx = 0; if (pix_x > 0) { cu_t *p_left_cu = &h->scu_data[scu_xy - 1]; if (p_left_cu->i_slice_nr == p_cu->i_slice_nr) { h->lcu.c_ipred_mode_ctx = p_left_cu->c_ipred_mode != 0; h->lcu.i_left_cu_qp = p_left_cu->i_qp; } } } /* --------------------------------------------------------------------------- */ static INLINE void cu_read_end(davs2_t *h, cu_t *p_cu, int i_level, int scu_xy) { cu_t *p_cu_iter = &h->scu_data[scu_xy]; int size_in_scu = 1 << (i_level - MIN_CU_SIZE_IN_BIT); int i; if (size_in_scu <= 1) { return; } /* the first row */ for (i = 1; i < size_in_scu; i++) { memcpy(p_cu_iter + i, p_cu, sizeof(cu_t)); } /* the remaining rows */ for (i = 1; i < size_in_scu; i++) { p_cu_iter += h->i_width_in_scu; memcpy(p_cu_iter, p_cu, size_in_scu * sizeof(cu_t)); } }
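/* ---------------------------------------------------------------------------
 * Illustration (editor's sketch, not part of the decoder): the function
 * below codes the luma intra mode against two most probable modes (MPMs)
 * taken from the top and left neighbors. With top = 12 and left = 2:
 *
 *     mpm[0] = min(12, 2) = 2;   mpm[1] = max(12, 2) = 12;
 *     parsed -2 -> mpm[0], parsed -1 -> mpm[1]   (an MPM was signalled)
 *     parsed  5 -> 5 + (5 >= mpm[0]) + (5 + 1 >= mpm[1]) = 6
 *
 * i.e. non-negative parsed values are shifted up to skip the MPM slots.
 */
/* --------------------------------------------------------------------------- */ static int cu_read_intrapred_mode_luma(davs2_t *h, aec_t *p_aec, cu_t *p_cu, int b8, int bi, int bj) { int size_in_scu = 1 << (p_cu->i_cu_level - MIN_CU_SIZE_IN_BIT); int i_intramode = h->i_ipredmode; int8_t *p_intramode = h->p_ipredmode + bj * i_intramode + bi; int intra_mode_top = p_intramode[-i_intramode]; int intra_mode_left = p_intramode[-1]; int luma_mode = aec_read_intra_pmode(p_aec); int mpm[2]; int8_t real_luma_mode; #if AVS2_TRACE strncpy(p_aec->tracestring, "Ipred Mode", TRACESTRING_SIZE); #endif assert(IS_INTRA(p_cu) && b8 < 4 && b8 >= 0); AEC_RETURN_ON_ERROR(-1); mpm[0] = DAVS2_MIN(intra_mode_top, intra_mode_left); mpm[1] = DAVS2_MAX(intra_mode_top, intra_mode_left); if (mpm[0] == mpm[1]) { mpm[0] = DC_PRED; mpm[1] = (mpm[1] == DC_PRED) ? BI_PRED : mpm[1]; } real_luma_mode = (int8_t)((luma_mode < 0) ? mpm[luma_mode + 2] : luma_mode + (luma_mode >= mpm[0]) + (luma_mode + 1 >= mpm[1])); if (real_luma_mode < 0 || real_luma_mode >= NUM_INTRA_MODE) { davs2_log(h, DAVS2_LOG_ERROR, "invalid pred mode %2d. POC %3d, pixel (%3d, %3d), %2dx%2d", real_luma_mode, h->i_poc, bi << MIN_PU_SIZE_IN_BIT, bj << MIN_PU_SIZE_IN_BIT, size_in_scu << MIN_CU_SIZE_IN_BIT, size_in_scu << MIN_CU_SIZE_IN_BIT); real_luma_mode = (int8_t)davs2_clip3(real_luma_mode, 0, NUM_INTRA_MODE - 1); } p_cu->intra_pred_modes[b8] = real_luma_mode; // store intra prediction mode, for MPM of next blocks { int w_4x4 = size_in_scu << 1; int h_4x4 = size_in_scu << 1; int j; switch (p_cu->i_trans_size) { case TU_SPLIT_HOR: h_4x4 >>= 2; break; case TU_SPLIT_VER: w_4x4 >>= 2; break; case TU_SPLIT_CROSS: w_4x4 >>= 1; h_4x4 >>= 1; break; } for (j = 0; j < h_4x4; j++) { int i = (j == h_4x4 - 1) ?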
0 : w_4x4 - 1; for (; i < w_4x4; i++) { p_intramode[i] = real_luma_mode; } p_intramode += i_intramode; } } return 0; } /* --------------------------------------------------------------------------- */ static void cu_store_references(davs2_t *h, cu_t *p_cu, int pix_x, int pix_y) { int width_in_spu = h->i_width_in_spu; int block8_y = pix_y >> MIN_PU_SIZE_IN_BIT; int block8_x = pix_x >> MIN_PU_SIZE_IN_BIT; int idx_pu; for (idx_pu = 0; idx_pu < p_cu->num_pu; idx_pu++) { ref_idx_t *p_ref_1st; int8_t *p_dirpred; int8_t i_dir_pred; ref_idx_t ref_idx; int b8_x, b8_y; int r, c; cb_t pu; pu.v = p_cu->pu[idx_pu].v >> 2; b8_x = block8_x + pu.x; b8_y = block8_y + pu.y; i_dir_pred = (int8_t)p_cu->b8pdir[idx_pu]; ref_idx = p_cu->ref_idx[idx_pu]; p_dirpred = h->p_dirpred + b8_y * width_in_spu + b8_x; p_ref_1st = h->p_ref_idx + b8_y * width_in_spu + b8_x; for (r = pu.h; r != 0; r--) { for (c = 0; c < pu.w; c++) { p_ref_1st[c] = ref_idx; p_dirpred[c] = i_dir_pred; } p_ref_1st += width_in_spu; p_dirpred += width_in_spu; } } } /* --------------------------------------------------------------------------- */ static int cu_read_mv(davs2_t *h, aec_t *p_aec, int i_level, int scu_xy, int pix_x, int pix_y) { cu_t *p_cu = &h->scu_data[scu_xy]; int bframe = (h->i_frame_type == AVS2_B_SLICE); int idx_pu; int block8_y = pix_y >> MIN_PU_SIZE_IN_BIT; int block8_x = pix_x >> MIN_PU_SIZE_IN_BIT; int width_in_spu = h->i_width_in_spu; int distance_fwd; int distance_fwd_src; int distance_bwd; // TODO: initial values for the non-B-frame case? assert(p_cu->i_cu_type != PRED_SKIP); if (h->i_frame_type == AVS2_F_SLICE && /*h->b_dmh &&*/ p_cu->b8pdir[0] == PDIR_FWD && p_cu->b8pdir[1] == PDIR_FWD && p_cu->b8pdir[2] == PDIR_FWD && p_cu->b8pdir[3] == PDIR_FWD) { // has forward vector if (!(i_level == B8X8_IN_BIT && p_cu->i_cu_type >= PRED_2NxN && p_cu->i_cu_type <= PRED_nRx2N)) { p_cu->i_dmh_mode = (int8_t)aec_read_dmh_mode(p_aec, p_cu->i_cu_level); AEC_RETURN_ON_ERROR(-1); #if AVS2_TRACE avs2_trace("dmh_mode = %3d\n", p_cu->i_dmh_mode); #endif } else { p_cu->i_dmh_mode = 0; } } //===== READ PDIR_FWD MOTION VECTORS ===== for (idx_pu = 0; idx_pu < p_cu->num_pu; idx_pu++) { if (p_cu->b8pdir[idx_pu] != PDIR_BWD) { int pu_pix_x = p_cu->pu[idx_pu].x; int pu_pix_y = p_cu->pu[idx_pu].y; int bsx = p_cu->pu[idx_pu].w; int bsy = p_cu->pu[idx_pu].h; int i8 = block8_x + (pu_pix_x >> 2); int j8 = block8_y + (pu_pix_y >> 2); int refframe = h->p_ref_idx[j8 * width_in_spu + i8].r[0]; mv_t mv, mvp; int ii, jj; // first make mv-prediction int pu_mvp_type = get_pu_type_for_mvp(bsx, bsy, pu_pix_x, pu_pix_y); get_mvp_default(h, p_cu, pix_x + pu_pix_x, pix_y + pu_pix_y, &mvp, 0, refframe, bsx, pu_mvp_type); bsx >>= MIN_PU_SIZE_IN_BIT; bsy >>= MIN_PU_SIZE_IN_BIT; if (h->i_frame_type != AVS2_S_SLICE) { // no mvd for S frame, just set it to 0 mv_t mvd; aec_read_mvds(p_aec, &mvd); pmvr_mv_derivation(h, &mv, &mvd, &mvp); #if AVS2_TRACE avs2_trace("@%d FMVD (pred %3d)\t\t\t%d \n", symbolCount++, mvp.x, mvd.x); avs2_trace("@%d FMVD (pred %3d)\t\t\t%d \n", symbolCount++, mvp.y, mvd.y); #endif AEC_RETURN_ON_ERROR(-1); } else { mv.v = mvp.v; } if (bframe) { mv_t *p_mv_1st = h->p_tmv_1st + j8 * width_in_spu + i8; for (jj = 0; jj < bsy; jj++) { for (ii = 0; ii < bsx; ii++) { p_mv_1st[ii] = mv; } p_mv_1st += width_in_spu; } p_cu->mv[idx_pu][0] = mv; } else { mv_t *p_mv_1st = h->p_tmv_1st + j8 * width_in_spu + i8; mv_t *p_mv_2nd = h->p_tmv_2nd + j8 * width_in_spu + i8; mv_t mv_2nd; if (p_cu->b8pdir[idx_pu] == PDIR_DUAL) { int distance_1st = get_distance_index_p(h, refframe); int
distance_1st_src = get_distance_index_p_scale(h, refframe); int distance_2nd = get_distance_index_p(h, !refframe); mv_2nd.x = scale_mv_skip(h, mv.x, distance_2nd, distance_1st_src); mv_2nd.y = scale_mv_skip_y(h, mv.y, distance_2nd, distance_1st, distance_1st_src); } else { mv_2nd.v = 0; } p_cu->mv[idx_pu][0] = mv; p_cu->mv[idx_pu][1] = mv_2nd; for (jj = 0; jj < bsy; jj++) { for (ii = 0; ii < bsx; ii++) { p_mv_1st[ii] = mv; p_mv_2nd[ii] = mv_2nd; } p_mv_1st += width_in_spu; p_mv_2nd += width_in_spu; } } } } if (!bframe) { return 0; } assert(h->i_pic_coding_type == FRAME); { distance_fwd = get_distance_index_b(h, B_FWD); // fwd distance_fwd_src = get_distance_index_b_scale(h, B_FWD); distance_bwd = get_distance_index_b(h, B_BWD); // bwd } //===== READ PDIR_BWD MOTION VECTORS ===== for (idx_pu = 0; idx_pu < p_cu->num_pu; idx_pu++) { if (p_cu->b8pdir[idx_pu] != PDIR_FWD) { // has backward vector int pu_pix_x = p_cu->pu[idx_pu].x; int pu_pix_y = p_cu->pu[idx_pu].y; int bsx = p_cu->pu[idx_pu].w; int bsy = p_cu->pu[idx_pu].h; int i8 = block8_x + (pu_pix_x >> 2); int j8 = block8_y + (pu_pix_y >> 2); int refframe = h->p_ref_idx[j8 * width_in_spu + i8].r[1]; mv_t *p_mv_2nd = h->p_tmv_2nd + j8 * width_in_spu + i8; mv_t mv, mvp; int ii, jj; int pu_mvp_type = get_pu_type_for_mvp(bsx, bsy, pu_pix_x, pu_pix_y); get_mvp_default(h, p_cu, pix_x + pu_pix_x, pix_y + pu_pix_y, &mvp, 1, refframe, bsx, pu_mvp_type); bsx >>= MIN_PU_SIZE_IN_BIT; bsy >>= MIN_PU_SIZE_IN_BIT; if (p_cu->b8pdir[idx_pu] == PDIR_SYM) { mv_t mv_1st; mv_1st = h->p_tmv_1st[j8 * width_in_spu + i8]; mv.x = -scale_mv_skip (h, mv_1st.x, distance_bwd, distance_fwd_src); mv.y = -scale_mv_skip_y(h, mv_1st.y, distance_bwd, distance_fwd, distance_fwd_src); } else { mv_t mvd; aec_read_mvds(p_aec, &mvd); pmvr_mv_derivation(h, &mv, &mvd, &mvp); #if AVS2_TRACE avs2_trace("@%d BMVD (pred %3d)\t\t\t%d \n", symbolCount++, mvp.x, mvd.x); avs2_trace("@%d BMVD (pred %3d)\t\t\t%d \n", symbolCount++, mvp.y, mvd.y); #endif AEC_RETURN_ON_ERROR(-1); } p_cu->mv[idx_pu][1] = mv; for (jj = 0; jj < bsy; jj++) { for (ii = 0; ii < bsx; ii++) { p_mv_2nd[ii] = mv; } p_mv_2nd += width_in_spu; } } } return 0; } /* --------------------------------------------------------------------------- * get all coefficients of one CU */ static int cu_read_all_coeffs(davs2_t *h, aec_t *p_aec, cu_t *p_cu) { runlevel_t *runlevel = &h->lcu.cg_info; int idx_cu_zscan = h->lcu.idx_cu_zscan_aec; #if CTRL_AEC_THREAD coeff_t *coeff_y = &h->lcu.lcu_aec->rec_info.coeff_buf_y [idx_cu_zscan << 6]; coeff_t *coeff_u = &h->lcu.lcu_aec->rec_info.coeff_buf_uv[0][idx_cu_zscan << 4]; coeff_t *coeff_v = &h->lcu.lcu_aec->rec_info.coeff_buf_uv[1][idx_cu_zscan << 4]; #else coeff_t *coeff_y = &h->lcu.rec_info.coeff_buf_y [idx_cu_zscan << 6]; coeff_t *coeff_u = &h->lcu.rec_info.coeff_buf_uv[0][idx_cu_zscan << 4]; coeff_t *coeff_v = &h->lcu.rec_info.coeff_buf_uv[1][idx_cu_zscan << 4]; #endif int bit_size = p_cu->i_cu_level; int i_tu_level = p_cu->i_cu_level; // depends on the number of coefficients contained in the transform block int b8; int uv; /*if (h->i_pic_coding_type == FRAME)*/ { runlevel->p_ctx_run = p_aec->syn_ctx.coeff_run[0]; runlevel->p_ctx_level = p_aec->syn_ctx.coeff_level; runlevel->p_ctx_sig_cg = p_aec->syn_ctx.sig_cg_contexts; runlevel->p_ctx_last_cg = p_aec->syn_ctx.last_cg_contexts; runlevel->p_ctx_last_pos_in_cg = p_aec->syn_ctx.last_coeff_pos; } // luma coefficients if (p_cu->i_trans_size == TU_SPLIT_NON) { i_tu_level = DAVS2_MIN(3, i_tu_level - B4X4_IN_BIT); runlevel->avs_scan = tab_scan_coeff[i_tu_level][TU_SPLIT_NON]; runlevel->cg_scan =
tab_scan_cg[i_tu_level][TU_SPLIT_NON]; if (p_cu->i_cbp & 0x0F) { int intra_pred_class = IS_INTRA(p_cu) ? tab_intra_mode_scan_type[p_cu->intra_pred_modes[0]] : INTRA_PRED_DC_DIAG; int b_swap_xy = (IS_INTRA(p_cu) && intra_pred_class == INTRA_PRED_HOR); int blocksize = 1 << (i_tu_level + B4X4_IN_BIT); int shift, scale; int wq_size_id = DAVS2_MIN(3, bit_size - B4X4_IN_BIT); cu_get_quant_params(h, p_cu->i_qp, bit_size - (p_cu->i_trans_size != TU_SPLIT_NON), &shift, &scale); #if !CTRL_AEC_THREAD gf_davs2.fast_memzero(coeff_y, sizeof(coeff_t) * blocksize * blocksize); #endif p_cu->dct_pattern[0] = cu_get_block_coeffs(p_aec, runlevel, p_cu, coeff_y, blocksize, blocksize, i_tu_level, 1, intra_pred_class, b_swap_xy, scale, shift, wq_size_id); } } else { int b_wavelet_conducted = (bit_size == B64X64_IN_BIT && p_cu->i_trans_size != TU_SPLIT_CROSS); cb_t tus[4]; int shift, scale; int wq_size_id = DAVS2_MIN(3, bit_size - B4X4_IN_BIT); cu_init_transform_units(p_cu, tus); tus[0].v >>= b_wavelet_conducted; tus[1].v >>= b_wavelet_conducted; tus[2].v >>= b_wavelet_conducted; tus[3].v >>= b_wavelet_conducted; i_tu_level -= B8X8_IN_BIT; i_tu_level -= b_wavelet_conducted; cu_get_quant_params(h, p_cu->i_qp, p_cu->i_cu_level - (p_cu->i_trans_size != TU_SPLIT_NON), &shift, &scale); if (p_cu->i_trans_size == TU_SPLIT_CROSS) { wq_size_id = DAVS2_MIN(3, bit_size - B8X8_IN_BIT); } else { wq_size_id = bit_size - B8X8_IN_BIT; wq_size_id -= (p_cu->i_cu_level == B64X64_IN_BIT); } runlevel->avs_scan = tab_scan_coeff[i_tu_level][p_cu->i_trans_size]; runlevel->cg_scan = tab_scan_cg[i_tu_level][p_cu->i_trans_size]; for (b8 = 0; b8 < 4; b8++) { /* all 4 blocks */ if (p_cu->i_cbp & (1 << b8)) { int bsx = tus[b8].w; int bsy = tus[b8].h; int intra_pred_class = IS_INTRA(p_cu) ? tab_intra_mode_scan_type[p_cu->intra_pred_modes[b8]] : INTRA_PRED_DC_DIAG; int b_swap_xy = (IS_INTRA(p_cu) && intra_pred_class == INTRA_PRED_HOR && p_cu->i_cu_type != PRED_I_2Nxn && p_cu->i_cu_type != PRED_I_nx2N); coeff_t *p_res = coeff_y + (b8 << ((bit_size - 1) << 1)); #if !CTRL_AEC_THREAD gf_davs2.fast_memzero(p_res, sizeof(coeff_t) * bsx * bsy); #endif p_cu->dct_pattern[b8] = cu_get_block_coeffs(p_aec, runlevel, p_cu, p_res, bsx, bsy, i_tu_level, 1, intra_pred_class, b_swap_xy, scale, shift, wq_size_id); if (p_cu->dct_pattern[b8] < 0) { return -1; } } } }
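/* ---------------------------------------------------------------------------
 * Illustrative note (editorial, derived from the bit tests in this function):
 * i_cbp packs the coded-block pattern with bits 0..3 flagging the four luma
 * transform blocks and bits 4..5 flagging the chroma U/V blocks. For example
 * (values assumed for the sake of the example), i_cbp == 0x31 (binary 110001)
 * means luma block 0 plus both chroma blocks carry non-zero coefficients,
 * while i_cbp == 0 lets the whole residual stage below be skipped. */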
// adaptive frequency weighting quantization i_tu_level = p_cu->i_cu_level - B8X8_IN_BIT; runlevel->avs_scan = tab_scan_coeff[i_tu_level][TU_SPLIT_NON]; runlevel->cg_scan = tab_scan_cg[i_tu_level][TU_SPLIT_NON]; /*if (h->i_pic_coding_type == FRAME)*/ { runlevel->p_ctx_run = p_aec->syn_ctx.coeff_run[1]; runlevel->p_ctx_level = p_aec->syn_ctx.coeff_level + 20; runlevel->p_ctx_sig_cg = p_aec->syn_ctx.sig_cg_contexts + NUM_SIGCG_CTX_LUMA; runlevel->p_ctx_last_cg = p_aec->syn_ctx.last_cg_contexts + NUM_LAST_CG_CTX_LUMA; runlevel->p_ctx_last_pos_in_cg = p_aec->syn_ctx.last_coeff_pos + NUM_LAST_POS_CTX_LUMA; } if (h->i_chroma_format != CHROMA_400) { int wq_size_id = p_cu->i_cu_level - 1; for (uv = 0; uv < 2; uv++) { if ((p_cu->i_cbp >> (uv + 4)) & 0x1) { int blocksize = 1 << wq_size_id; coeff_t *p_res = uv ? coeff_v : coeff_u; int shift, scale; #if !CTRL_AEC_THREAD gf_davs2.fast_memzero(p_res, sizeof(coeff_t) * blocksize * blocksize); #endif cu_get_quant_params(h, cu_get_chroma_qp(h, p_cu->i_qp, uv), wq_size_id, &shift, &scale); p_cu->dct_pattern[4 + uv] = cu_get_block_coeffs(p_aec, runlevel, p_cu, p_res, blocksize, blocksize, i_tu_level, 0, INTRA_PRED_DC_DIAG, 0, scale, shift, wq_size_id); if (p_cu->dct_pattern[4 + uv] < 0) { return -1; } } } } return 0; } /* --------------------------------------------------------------------------- * get the syntax elements from the NAL, return cu_type */ static int cu_read_header(davs2_t *h, aec_t *p_aec, cu_t *p_cu, int pix_x, int pix_y, int *p_real_cu_type) { int real_cu_type; p_cu->i_md_directskip_mode = 0; if (h->i_frame_type == AVS2_S_SLICE) { real_cu_type = aec_read_cu_type_sframe(p_aec); } else { real_cu_type = aec_read_cu_type(p_aec, p_cu, h->i_frame_type, h->seq_info.enable_amp, h->seq_info.enable_mhp_skip, h->seq_info.enable_wsm, h->num_of_references); } AEC_RETURN_ON_ERROR(-1); *p_real_cu_type = real_cu_type; real_cu_type = DAVS2_MAX(0, real_cu_type); p_cu->i_cu_type = (int8_t)real_cu_type; /* parse the inter prediction direction */ if (h->i_frame_type != AVS2_I_SLICE && IS_INTER_MODE(real_cu_type)) { aec_read_inter_pred_dir(p_aec, p_cu, h); AEC_RETURN_ON_ERROR(-1); } if (IS_INTRA(p_cu)) { int size_8x8 = 1 << (p_cu->i_cu_level - B8X8_IN_BIT); int size_16x16 = 1 << (p_cu->i_cu_level - B16X16_IN_BIT); int y_4x4 = pix_y >> MIN_PU_SIZE_IN_BIT; int x_4x4 = pix_x >> MIN_PU_SIZE_IN_BIT; real_cu_type = aec_read_intra_cu_type(p_aec, p_cu, h->seq_info.enable_sdip, h); p_cu->i_cu_type = (int8_t)real_cu_type; AEC_RETURN_ON_ERROR(-1); /* Read luma block prediction modes */ if (cu_read_intrapred_mode_luma(h, p_aec, p_cu, 0, x_4x4, y_4x4) < 0) { return -1; } switch (real_cu_type) { case PRED_I_2Nxn: if (cu_read_intrapred_mode_luma(h, p_aec, p_cu, 1, x_4x4, y_4x4 + 1 * size_16x16) < 0 || cu_read_intrapred_mode_luma(h, p_aec, p_cu, 2, x_4x4, y_4x4 + 2 * size_16x16) < 0 || cu_read_intrapred_mode_luma(h, p_aec, p_cu, 3, x_4x4, y_4x4 + 3 * size_16x16) < 0) { return -1; } break; case PRED_I_nx2N: if (cu_read_intrapred_mode_luma(h, p_aec, p_cu, 1, x_4x4 + 1 * size_16x16, y_4x4) < 0 || cu_read_intrapred_mode_luma(h, p_aec, p_cu, 2, x_4x4 + 2 * size_16x16, y_4x4) < 0 || cu_read_intrapred_mode_luma(h, p_aec, p_cu, 3, x_4x4 + 3 * size_16x16, y_4x4) < 0) { return -1; } break; case PRED_I_NxN: if (cu_read_intrapred_mode_luma(h, p_aec, p_cu, 1, x_4x4 + size_8x8, y_4x4 + 0) < 0 || cu_read_intrapred_mode_luma(h, p_aec, p_cu, 2, x_4x4 + 0, y_4x4 + size_8x8) < 0 || cu_read_intrapred_mode_luma(h, p_aec, p_cu, 3, x_4x4 + size_8x8, y_4x4 + size_8x8) < 0) { return -1; } break; default: break; } #if AVS2_TRACE strncpy(p_aec->tracestring, "Chroma intra pred mode", TRACESTRING_SIZE); #endif if (h->i_chroma_format != CHROMA_400) { p_cu->c_ipred_mode = (int8_t)aec_read_intra_pmode_c(p_aec, h, p_cu->intra_pred_modes[0]); } else { p_cu->c_ipred_mode = 0; } AEC_RETURN_ON_ERROR(-1); } return 0; } /* --------------------------------------------------------------------------- * read CU information from bitstream */ static int cu_read_info(davs2_t *h, cu_t *p_cu, int i_level, int scu_xy, int pix_x, int pix_y) { aec_t *p_aec = &h->aec; int size_in_scu = 1 << (i_level - MIN_CU_SIZE_IN_BIT); int real_cu_type; /* 0, initialize cu data */ cu_init(h, p_cu, i_level, scu_xy, pix_x); /* 1, read cu type and delta_QP * including PU partition, intra prediction mode, reference indexes */ if (cu_read_header(h, p_aec, p_cu, pix_x, pix_y, 
&real_cu_type) < 0) { return -1; } // get the size and pos of prediction units cu_init_prediction_units(h, p_cu); /* 2, read motion vectors and reference indexes */ if (IS_INTRA(p_cu)) { int i = 0; for (i = 0; i < 4; i++) { p_cu->ref_idx[i].r[0] = INVALID_REF; p_cu->ref_idx[i].r[1] = INVALID_REF; p_cu->b8pdir[i] = PDIR_INVALID; } // TODO: already initialized at frame level, so the cu_store_references() call here should be redundant cu_store_references(h, p_cu, pix_x, pix_y); } else if (p_cu->i_cu_type == PRED_SKIP) { cu_get_neighbors(h, p_cu, pix_x, pix_y, 1 << i_level, 1 << i_level); fill_mv_and_ref_for_skip(h, p_cu, pix_x, pix_y, size_in_scu); } else { cu_store_references(h, p_cu, pix_x, pix_y); if (cu_read_mv(h, p_aec, p_cu->i_cu_level, scu_xy, pix_x, pix_y) < 0) { return -1; } } /* 3, read CBP and coefficients */ if (real_cu_type < 0) { /* skip mode, no residual */ p_cu->i_qp = h->lcu.i_left_cu_qp; p_cu->i_trans_size = TU_SPLIT_NON; // cbp has been initialized as zero } else { // non-skip mode // read CBP if (cu_read_cbp(h, p_aec, p_cu, pix_x >> MIN_CU_SIZE_IN_BIT, pix_y >> MIN_CU_SIZE_IN_BIT) < 0) { return -1; } if (p_cu->i_cbp != 0) { if (cu_read_all_coeffs(h, p_aec, p_cu) < 0) { // read all coefficients return -1; } } } AEC_RETURN_ON_ERROR(-1); /* 4, finish decoding the cu data */ cu_read_end(h, p_cu, i_level, scu_xy); return 0; } /* --------------------------------------------------------------------------- */ void decoder_wait_lcu_row(davs2_t *h, davs2_frame_t *frame, int line) { line = DAVS2_MAX(line, 0); line = DAVS2_MIN(line, h->i_height_in_lcu - 1); if (frame->i_decoded_line < line && frame->num_decoded_lcu_in_row[line] < h->i_width_in_lcu + 1) { davs2_thread_mutex_lock(&frame->mutex_recon); while (frame->i_decoded_line < line && frame->num_decoded_lcu_in_row[line] < h->i_width_in_lcu + 1) { davs2_thread_cond_wait(&frame->conds_lcu_row[line], &frame->mutex_recon); } davs2_thread_mutex_unlock(&frame->mutex_recon); } } /* --------------------------------------------------------------------------- */ void decoder_wait_row(davs2_t *h, davs2_frame_t *frame, int max_y_in_pic) { int line = (max_y_in_pic + 8) >> h->i_lcu_level; line = DAVS2_MAX(line, 0); line = DAVS2_MIN(line, h->i_height_in_lcu - 1); decoder_wait_lcu_row(h, frame, line); } /* --------------------------------------------------------------------------- * img_size: picture width or height (in integer-pel units) * blk_size: width or height of the current prediction block (integer-pel) * blk_pos : x/y position of the current block in the picture (integer-pel) * mv : x/y component of the MV (1/4-pel precision) */ static INLINE int cu_get_mc_pos(int img_size, int blk_size, int blk_pos, int mv) { int imv = mv >> 2; // integer-pel part of the MV int fmv = mv & 7; // fractional part of the MV, kept at 1/8-pel precision if (blk_pos + imv < -blk_size - 8) { return ((-blk_size - 8) << 2) + (fmv); } else if (blk_pos + imv > img_size + 4) { return ((img_size + 4) << 2) + (fmv); } else { return (blk_pos << 2) + mv; } } /* --------------------------------------------------------------------------- * clip mv */ static INLINE void cu_get_mc_pos_mv(davs2_t *h, mv_t *mv, int pic_pix_x, int pic_pix_y, int blk_w, int blk_h) { mv->x = (int16_t)cu_get_mc_pos(h->i_width, blk_w, pic_pix_x, mv->x); mv->y = (int16_t)cu_get_mc_pos(h->i_height, blk_h, pic_pix_y, mv->y); }
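/* ---------------------------------------------------------------------------
 * Worked example of the MV clipping above (editorial; the numbers are
 * assumptions chosen for illustration, not taken from the decoder): for a
 * 16-pel wide block at blk_pos = 0 in a 1920-pel wide picture, a 1/4-pel MV
 * component mv = -200 has integer part imv = -50, so blk_pos + imv = -50 is
 * below -16 - 8 = -24 and the returned position is clamped to
 * ((-24) << 2) + (-200 & 7) = -96, i.e. -24 integer pels in 1/4-pel units.
 * For blk_pos = 64 the same MV stays in range and the function returns
 * (64 << 2) + (-200) = 56, i.e. 14 integer pels in 1/4-pel units. */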
/* --------------------------------------------------------------------------- * decode one coding unit */ static int davs2_get_inter_pred(davs2_t *h, davs2_row_rec_t *row_rec, cu_t *p_cu, int ctu_x, int ctu_y) { static const int mv_shift = 2; int pu_idx; for (pu_idx = 0; pu_idx < p_cu->num_pu; pu_idx++) { int pix_x, pix_y, width, height; int vec1_x, vec1_y, vec2_x, vec2_y; int pred_dir; int ref_1st, ref_2nd; cb_t *pu = &p_cu->pu[pu_idx]; mv_t mv_1st, mv_2nd; davs2_frame_t *p_fref1, *p_fref2; p_fref1 = p_fref2 = NULL; ref_1st = ref_2nd = 0; mv_1st.v = mv_2nd.v = 0; pix_x = ctu_x + pu->x; pix_y = ctu_y + pu->y; width = pu->w; height = pu->h; pred_dir = p_cu->b8pdir[pu_idx]; if (pred_dir == PDIR_BWD) { ref_1st = B_BWD; mv_1st = p_cu->mv[pu_idx][1]; p_fref1 = h->fref[B_BWD]; } else if (pred_dir == PDIR_SYM || pred_dir == PDIR_BID) { mv_1st.v = p_cu->mv[pu_idx][0].v; mv_2nd.v = p_cu->mv[pu_idx][1].v; p_fref1 = h->fref[B_FWD]; p_fref2 = h->fref[B_BWD]; } else { /* FWD or DUAL */ int dmh_mode = p_cu->i_dmh_mode; ref_1st = p_cu->ref_idx[pu_idx].r[0]; mv_1st = p_cu->mv[pu_idx][0]; if (h->i_frame_type == AVS2_B_SLICE) { /* for B frame */ ref_1st = 0; p_fref1 = h->fref[B_FWD]; } else { if (pred_dir == PDIR_DUAL) { mv_2nd = p_cu->mv[pu_idx][1]; ref_2nd = p_cu->ref_idx[pu_idx].r[1]; p_fref1 = h->fref[ref_1st]; p_fref2 = h->fref[ref_2nd]; } else if (dmh_mode) { mv_2nd.x = mv_1st.x + dmh_pos[dmh_mode][1][0]; mv_2nd.y = mv_1st.y + dmh_pos[dmh_mode][1][1]; mv_1st.x += dmh_pos[dmh_mode][0][0]; mv_1st.y += dmh_pos[dmh_mode][0][1]; ref_2nd = ref_1st; p_fref1 = p_fref2 = h->fref[ref_1st]; } else { p_fref1 = h->fref[ref_1st]; } } } cu_get_mc_pos_mv(h, &mv_1st, pix_x + row_rec->ctu.i_pix_x, pix_y + row_rec->ctu.i_pix_y, width, height); vec1_x = mv_1st.x; vec1_y = mv_1st.y; cu_get_mc_pos_mv(h, &mv_2nd, pix_x + row_rec->ctu.i_pix_x, pix_y + row_rec->ctu.i_pix_y, width, height); vec2_x = mv_2nd.x; vec2_y = mv_2nd.y; // TODO: reference frame management for the background-frame reference case needs to be fixed in the RPS handling // if (h->b_bkgnd_reference && h->num_of_references >= 2 && ref_1st == h->num_of_references - 1 && (h->i_frame_type == AVS2_P_SLICE || h->i_frame_type == AVS2_F_SLICE) && h->i_typeb != AVS2_S_SLICE) { // p_fref1 = h->f_background_ref; // } else if (h->i_typeb == AVS2_S_SLICE) { // p_fref1 = h->f_background_ref; // } /* luma prediction */ if (p_fref1 != NULL) { int i_pred = row_rec->ctu.i_fdec[IMG_Y]; int i_fref = h->fref[0]->i_stride[IMG_Y]; pel_t *p_pred = row_rec->ctu.p_fdec[IMG_Y] + pix_y * i_pred + pix_x; decoder_wait_row(h, p_fref1, (vec1_y >> mv_shift) + height + 8 + 4); mc_luma(h, p_pred, i_pred, vec1_x, vec1_y, width, height, p_fref1->planes[IMG_Y], i_fref); if (p_fref2 != NULL) { pel_t *p_temp = row_rec->pred_blk; decoder_wait_row(h, p_fref2, (vec2_y >> mv_shift) + height + 8 + 4); mc_luma(h, p_temp, LCU_STRIDE, vec2_x, vec2_y, width, height, p_fref2->planes[IMG_Y], i_fref); gf_davs2.block_avg(p_pred, i_pred, p_pred, i_pred, p_temp, LCU_STRIDE, width, height); } } else { davs2_log(h, DAVS2_LOG_ERROR, "non-existing reference frame. 
PB (%d, %d)", pix_x, pix_y); return -1; } /* chroma prediction */ if (h->i_chroma_format == CHROMA_420) { pix_x >>= 1; pix_y >>= 1; width >>= 1; height >>= 1; if (p_fref2 == NULL) { int i_fref = p_fref1->i_stride[IMG_U]; int i_pred = row_rec->ctu.i_fdec[IMG_U]; pel_t *p_pred = row_rec->ctu.p_fdec[IMG_U] + pix_y * i_pred + pix_x; mc_chroma(h, p_pred, i_pred, vec1_x, vec1_y, width, height, p_fref1->planes[IMG_U], i_fref); i_fref = p_fref1->i_stride[IMG_V]; i_pred = row_rec->ctu.i_fdec[IMG_V]; p_pred = row_rec->ctu.p_fdec[IMG_V] + pix_y * i_pred + pix_x; mc_chroma(h, p_pred, i_pred, vec1_x, vec1_y, width, height, p_fref1->planes[IMG_V], i_fref); } else { /* u component */ int i_fref = p_fref1->i_stride[IMG_U]; int i_pred = row_rec->ctu.i_fdec[IMG_U]; pel_t *p_pred = row_rec->ctu.p_fdec[IMG_U] + pix_y * i_pred + pix_x; pel_t *p_temp = row_rec->pred_blk; mc_chroma(h, p_pred, i_pred, vec1_x, vec1_y, width, height, p_fref1->planes[IMG_U], i_fref); mc_chroma(h, p_temp, LCU_STRIDE >> 1, vec2_x, vec2_y, width, height, p_fref2->planes[IMG_U], i_fref); gf_davs2.block_avg(p_pred, i_pred, p_pred, i_pred, p_temp, LCU_STRIDE >> 1, width, height); /* v component */ i_fref = p_fref1->i_stride[IMG_V]; i_pred = row_rec->ctu.i_fdec[IMG_V]; p_pred = row_rec->ctu.p_fdec[IMG_V] + pix_y * i_pred + pix_x; mc_chroma(h, p_pred, i_pred, vec1_x, vec1_y, width, height, p_fref1->planes[IMG_V], i_fref); mc_chroma(h, p_temp, LCU_STRIDE >> 1, vec2_x, vec2_y, width, height, p_fref2->planes[IMG_V], i_fref); gf_davs2.block_avg(p_pred, i_pred, p_pred, i_pred, p_temp, LCU_STRIDE >> 1, width, height); } } // chroma format YUV420 } return 0; } /* --------------------------------------------------------------------------- * reconstruct a CU */ static int cu_recon(davs2_t *h, davs2_row_rec_t *row_rec, cu_t *p_cu, int pix_x, int pix_y) { int ctu_x = pix_x - row_rec->ctu.i_pix_x; int ctu_y = pix_y - row_rec->ctu.i_pix_y; int ctu_c_x = ctu_x >> 1; int ctu_c_y = ctu_y >> 1; int blockidx; cb_t tus[4]; cu_init_transform_units(p_cu, tus); if (IS_INTRA(p_cu)) { /* intra cu */ /* 1, luma component, prediction and residual coding */ if (p_cu->i_trans_size == TU_SPLIT_NON) { davs2_get_intra_pred(row_rec, p_cu, p_cu->intra_pred_modes[0], ctu_x, ctu_y, tus[0].w, tus[0].h); if (p_cu->i_cbp & 0x0F) { davs2_get_recons(row_rec, p_cu, 0, &tus[0], ctu_x, ctu_y); } } else { for (blockidx = 0; blockidx < 4; blockidx++) { davs2_get_intra_pred(row_rec, p_cu, p_cu->intra_pred_modes[blockidx], ctu_x + tus[blockidx].x, ctu_y + tus[blockidx].y, tus[blockidx].w, tus[blockidx].h); if (p_cu->i_cbp & (1 << blockidx)) { davs2_get_recons(row_rec, p_cu, blockidx, &tus[blockidx], ctu_x, ctu_y); } } } /* 2, chroma component prediction */ if (h->i_chroma_format == CHROMA_420) { davs2_get_intra_pred_chroma(row_rec, p_cu, ctu_c_x, ctu_c_y); } } else { /* inter cu */ /* 1, prediction (including luma and chroma) */ if (davs2_get_inter_pred(h, row_rec, p_cu, ctu_x, ctu_y) < 0) { return -1; } /* 2, luma residual decoding */ if (p_cu->i_trans_size == TU_SPLIT_NON) { if (p_cu->i_cbp & 0x0F) { davs2_get_recons(row_rec, p_cu, 0, &tus[0], ctu_x, ctu_y); } } else { for (blockidx = 0; blockidx < 4; blockidx++) { if (p_cu->i_cbp & (1 << blockidx)) { davs2_get_recons(row_rec, p_cu, blockidx, &tus[blockidx], ctu_x, ctu_y); } } } } /* 3, chroma residual decoding */ if (h->i_chroma_format == CHROMA_420) { cb_t cur_cb; cur_cb.w = cur_cb.h = 1 << (p_cu->i_cu_level - 1); cur_cb.y = 1 << p_cu->i_cu_level; cur_cb.x = 0; if (p_cu->i_cbp & (1 << 4)) { davs2_get_recons(row_rec, p_cu, 4, 
&cur_cb, ctu_x, ctu_y); } cur_cb.x = (int8_t)cur_cb.h; if (p_cu->i_cbp & (1 << 5)) { davs2_get_recons(row_rec, p_cu, 5, &cur_cb, ctu_x, ctu_y); } } return 0; } /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE void copy_lcu_col1(pel_t *dst, pel_t *src, const int height, const int stride) { int i, k; for (i = 0, k = 0; i < height; i++, k += stride) { dst[k] = src[k]; } } /* --------------------------------------------------------------------------- */ void decode_lcu_init(davs2_t *h, int i_lcu_x, int i_lcu_y) { const int num_in_scu = 1 << (h->i_lcu_level - MIN_CU_SIZE_IN_BIT); const int width_in_scu = h->i_width_in_scu; int lcu_w_in_scu, lcu_h_in_scu; int i, j; assert(h->lcu.i_scu_xy >= 0 && h->lcu.i_scu_xy < h->i_size_in_scu); // update coordinates of the current coding unit h->lcu.i_scu_x = i_lcu_x << (h->i_lcu_level - MIN_CU_SIZE_IN_BIT); h->lcu.i_scu_y = i_lcu_y << (h->i_lcu_level - MIN_CU_SIZE_IN_BIT); h->lcu.i_scu_xy = h->lcu.i_scu_y * width_in_scu + h->lcu.i_scu_x; h->lcu.i_spu_x = h->lcu.i_scu_x * BLOCK_MULTIPLE; // luma block position h->lcu.i_spu_y = h->lcu.i_scu_y * BLOCK_MULTIPLE; // luma block position h->lcu.i_pix_x = h->lcu.i_scu_x << MIN_CU_SIZE_IN_BIT; // luma pixel position h->lcu.i_pix_y = h->lcu.i_scu_y << MIN_CU_SIZE_IN_BIT; // luma pixel position h->lcu.i_pix_c_x = h->lcu.i_scu_x << (MIN_CU_SIZE_IN_BIT - 1); // chroma pixel position if (h->i_chroma_format == CHROMA_420) { h->lcu.i_pix_c_y = h->lcu.i_scu_y << (MIN_CU_SIZE_IN_BIT - 1); // chroma pixel position } // actual width and height (in pixel) for current lcu lcu_w_in_scu = DAVS2_MIN((h->i_width - h->lcu.i_pix_x) >> MIN_CU_SIZE_IN_BIT, num_in_scu); lcu_h_in_scu = DAVS2_MIN((h->i_height - h->lcu.i_pix_y) >> MIN_CU_SIZE_IN_BIT, num_in_scu); h->lcu.i_pix_width = lcu_w_in_scu << MIN_CU_SIZE_IN_BIT; h->lcu.i_pix_height = lcu_h_in_scu << MIN_CU_SIZE_IN_BIT; // init slice index of current LCU for (i = 0; i < lcu_h_in_scu; i++) { cu_t *p_cu_iter = &h->scu_data[h->lcu.i_scu_xy + i * width_in_scu]; for (j = 0; j < lcu_w_in_scu; j++) { p_cu_iter->i_slice_nr = (int8_t)h->i_slice_index; p_cu_iter++; } } } /* --------------------------------------------------------------------------- */ void rowrec_lcu_init(davs2_t *h, davs2_row_rec_t *row_rec, int i_lcu_x, int i_lcu_y) { #if CTRL_AEC_THREAD row_rec->p_rec_info = &row_rec->lcu_info->rec_info; #else row_rec->p_rec_info = &h->lcu.rec_info; #endif row_rec->idx_cu_zscan = 0; /* CTU position */ row_rec->ctu.i_pix_x = i_lcu_x << h->i_lcu_level; row_rec->ctu.i_pix_y = i_lcu_y << h->i_lcu_level; row_rec->ctu.i_pix_x_c = i_lcu_x << (h->i_lcu_level - 1); row_rec->ctu.i_pix_y_c = i_lcu_y << (h->i_lcu_level - 1); row_rec->ctu.i_ctu_w = DAVS2_MIN(h->i_width - row_rec->ctu.i_pix_x, 1 << h->i_lcu_level); row_rec->ctu.i_ctu_h = DAVS2_MIN(h->i_height - row_rec->ctu.i_pix_y, 1 << h->i_lcu_level); row_rec->ctu.i_ctu_w_c = row_rec->ctu.i_ctu_w >> 1; row_rec->ctu.i_ctu_h_c = row_rec->ctu.i_ctu_h >> 1; row_rec->ctu.i_scu_x = i_lcu_x << (h->i_lcu_level - MIN_CU_SIZE_IN_BIT); row_rec->ctu.i_scu_y = i_lcu_y << (h->i_lcu_level - MIN_CU_SIZE_IN_BIT); row_rec->ctu.i_scu_xy = row_rec->ctu.i_scu_y * h->i_width_in_scu + row_rec->ctu.i_scu_x; row_rec->ctu.i_spu_x = row_rec->ctu.i_scu_x * BLOCK_MULTIPLE; // luma block position row_rec->ctu.i_spu_y = row_rec->ctu.i_scu_y * BLOCK_MULTIPLE; // luma block position /* init pointers */ row_rec->h = h; row_rec->ctu.i_frec[0] = h->fdec->i_stride[0]; row_rec->ctu.i_frec[1] = h->fdec->i_stride[1]; 
row_rec->ctu.i_frec[2] = h->fdec->i_stride[2]; row_rec->ctu.p_frec[0] = h->fdec->planes[0] + row_rec->ctu.i_pix_y * row_rec->ctu.i_frec[0] + row_rec->ctu.i_pix_x; row_rec->ctu.p_frec[1] = h->fdec->planes[1] + row_rec->ctu.i_pix_y_c * row_rec->ctu.i_frec[1] + row_rec->ctu.i_pix_x_c; row_rec->ctu.p_frec[2] = h->fdec->planes[2] + row_rec->ctu.i_pix_y_c * row_rec->ctu.i_frec[2] + row_rec->ctu.i_pix_x_c; #if 1 row_rec->ctu.i_fdec[0] = h->fdec->i_stride[0]; row_rec->ctu.i_fdec[1] = h->fdec->i_stride[1]; row_rec->ctu.i_fdec[2] = h->fdec->i_stride[2]; row_rec->ctu.p_fdec[0] = h->fdec->planes[0] + row_rec->ctu.i_pix_y * row_rec->ctu.i_fdec[0] + row_rec->ctu.i_pix_x; row_rec->ctu.p_fdec[1] = h->fdec->planes[1] + row_rec->ctu.i_pix_y_c * row_rec->ctu.i_fdec[1] + row_rec->ctu.i_pix_x_c; row_rec->ctu.p_fdec[2] = h->fdec->planes[2] + row_rec->ctu.i_pix_y_c * row_rec->ctu.i_fdec[2] + row_rec->ctu.i_pix_x_c; #else row_rec->ctu.i_fdec[0] = MAX_CU_SIZE; row_rec->ctu.i_fdec[1] = MAX_CU_SIZE; row_rec->ctu.i_fdec[2] = MAX_CU_SIZE; row_rec->ctu.p_fdec[0] = row_rec->fdec_buf; row_rec->ctu.p_fdec[1] = row_rec->fdec_buf + MAX_CU_SIZE * MAX_CU_SIZE; row_rec->ctu.p_fdec[2] = row_rec->fdec_buf + MAX_CU_SIZE * MAX_CU_SIZE + (MAX_CU_SIZE / 2); #endif } /* --------------------------------------------------------------------------- */ int decode_lcu_parse(davs2_t *h, int i_level, int pix_x, int pix_y) { const int width_in_scu = h->i_width_in_scu; const int pix_x_end = pix_x + (1 << i_level); const int pix_y_end = pix_y + (1 << i_level); int b_cu_inside_pic = (pix_x_end <= h->i_width) && (pix_y_end <= h->i_height); int split_flag = (i_level != MIN_CU_SIZE_IN_BIT); assert((pix_x < h->i_width) && (pix_y < h->i_height)); if (i_level > MIN_CU_SIZE_IN_BIT && b_cu_inside_pic) { split_flag = aec_read_split_flag(&h->aec, i_level); } if (split_flag) { int i_level_next = i_level - 1; int i; for (i = 0; i < 4; i++) { int sub_pix_x = pix_x + ((i & 1) << i_level_next); int sub_pix_y = pix_y + ((i >> 1) << i_level_next); if (sub_pix_x < h->i_width && sub_pix_y < h->i_height) { decode_lcu_parse(h, i_level_next, sub_pix_x, sub_pix_y); } } } else { int i_cu_x = (pix_x >> MIN_CU_SIZE_IN_BIT); int i_cu_y = (pix_y >> MIN_CU_SIZE_IN_BIT); int i_cu_xy = i_cu_y * width_in_scu + i_cu_x; cu_t *p_cu = &h->scu_data[i_cu_xy]; h->lcu.idx_cu_zscan_aec = tab_b8xy_to_zigzag[i_cu_y - h->lcu.i_scu_y][i_cu_x - h->lcu.i_scu_x]; if (cu_read_info(h, p_cu, i_level, i_cu_xy, pix_x, pix_y) < 0) { p_cu->i_slice_nr = -1; // set an invalid value to terminate the reconstruction return -1; } } return 0; } /* --------------------------------------------------------------------------- */ int decode_lcu_recon(davs2_t *h, davs2_row_rec_t *row_rec, int i_level, int pix_x, int pix_y) { const int width_in_scu = h->i_width_in_scu; int i_cu_x = (pix_x >> MIN_CU_SIZE_IN_BIT); int i_cu_y = (pix_y >> MIN_CU_SIZE_IN_BIT); int i_cu_xy = i_cu_y * width_in_scu + i_cu_x; cu_t *p_cu = &h->scu_data[i_cu_xy]; int split_flag = (p_cu->i_cu_level < i_level); assert((pix_x < h->i_width) && (pix_y < h->i_height)); if (split_flag) { int i_level_next = i_level - 1; int i; for (i = 0; i < 4; i++) { int sub_pix_x = pix_x + ((i & 1) << i_level_next); int sub_pix_y = pix_y + ((i >> 1) << i_level_next); if (sub_pix_x < h->i_width && sub_pix_y < h->i_height) { decode_lcu_recon(h, row_rec, i_level_next, sub_pix_x, sub_pix_y); } } } else { int i_cu_mask = h->i_lcu_size_sub1 >> MIN_CU_SIZE_IN_BIT; row_rec->idx_cu_zscan = tab_b8xy_to_zigzag[i_cu_y & i_cu_mask][i_cu_x & i_cu_mask]; if (p_cu->i_slice_nr 
== -1) { h->decoding_error = 1; davs2_log(h, DAVS2_LOG_WARNING, "invalid CU (%3d, %3d), POC %3d", pix_x, pix_y, h->i_poc); } cu_recon(h, row_rec, p_cu, pix_x, pix_y); } return 0; } davs2-1.6/source/common/cu.h000066400000000000000000000061711337322544400157630ustar00rootroot00000000000000/* * cu.h * * Description of this file: * CU Processing functions definition of the davs2 library * * -------------------------------------------------------------------------- * * davs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ #ifndef DAVS2_CU_H #define DAVS2_CU_H #ifdef __cplusplus extern "C" { #endif /* --------------------------------------------------------------------------- * init LCU decoding * \input param * h : decoder handler * i_lcu_x : LCU position index * i_lcu_y : LCU position index */ #define decode_lcu_init FPFX(decode_lcu_init) void decode_lcu_init (davs2_t *h, int i_lcu_x, int i_lcu_y); #define rowrec_lcu_init FPFX(rowrec_lcu_init) void rowrec_lcu_init (davs2_t *h, davs2_row_rec_t *row_rec, int i_lcu_x, int i_lcu_y); /* --------------------------------------------------------------------------- * process LCU entropy decoding (recursively) * \input param * h : decoder handler * i_level : log2(CU size) * pix_x : pixel position of the decoding CU in the frame in Luma component * pix_y : pixel position of the decoding CU in the frame in Luma component */ #define decode_lcu_parse FPFX(decode_lcu_parse) int decode_lcu_parse(davs2_t *h, int i_level, int pix_x, int pix_y); /* --------------------------------------------------------------------------- * process LCU reconstruction (recursively) * \input param * h : decoder handler * i_level : log2(CU size) * pix_x : pixel position of the decoding CU in the frame in Luma component * pix_y : pixel position of the decoding CU in the frame in Luma component */ #define decode_lcu_recon FPFX(decode_lcu_recon) int decode_lcu_recon(davs2_t *h, davs2_row_rec_t *row_rec, int i_level, int pix_x, int pix_y); #define decoder_wait_lcu_row FPFX(decoder_wait_lcu_row) void decoder_wait_lcu_row(davs2_t *h, davs2_frame_t *frame, int max_y_in_pic); #define decoder_wait_row FPFX(decoder_wait_row) void decoder_wait_row(davs2_t *h, davs2_frame_t *frame, int max_y_in_pic); #ifdef __cplusplus } #endif #endif // DAVS2_CU_H davs2-1.6/source/common/davs2.cc000066400000000000000000000606571337322544400165420ustar00rootroot00000000000000/* * davs2.cc * * Description of this file: * API functions definition of the davs2 library * * -------------------------------------------------------------------------- * * davs2 - video decoder of AVS2/IEEE1857.4 video 
coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ #include "common.h" #include "davs2.h" #include "primitives.h" #include "decoder.h" #include "bitstream.h" #include "header.h" #include "version.h" #include "decoder.h" #include "frame.h" #include "cpu.h" #include "threadpool.h" #include "version.h" /** * =========================================================================== * macro defines * =========================================================================== */ #if DAVS2_TRACE_API FILE *fp_trace_bs = NULL; FILE *fp_trace_in = NULL; #endif /** * =========================================================================== * function defines * =========================================================================== */ /* -------------------------------------------------------------------------- */ static es_unit_t * es_unit_alloc(int buf_size) { es_unit_t *es_unit = NULL; int bufsize = sizeof(es_unit_t) + buf_size; bufsize = ((bufsize + 31) >> 5 ) << 5; es_unit = (es_unit_t *)davs2_malloc(bufsize); if (es_unit == NULL) { davs2_log(NULL, DAVS2_LOG_ERROR, "failed to malloc memory in es_unit_alloc.\n"); return NULL; } es_unit->size = buf_size; es_unit->len = 0; es_unit->pts = 0; es_unit->dts = 0; return es_unit; } /* -------------------------------------------------------------------------- */ static void es_unit_free(es_unit_t *es_unit) { if (es_unit) { davs2_free(es_unit); } } /* --------------------------------------------------------------------------- * push byte stream data of one frame to input list */ static es_unit_t *davs2_pack_es_unit(davs2_mgr_t *mgr, const uint8_t *data, int len, int64_t pts, int64_t dts) { #define DAVS2_ISUNIT(x) ((x) == 0xB0 || (x) == 0xB1 || (x) == 0xB7 || (x) == 0xB3 || (x) == 0xB6) es_unit_t *es_unit = NULL; es_unit_t *ret_es_unit = NULL; int start_code = data[3]; if (mgr->es_unit == NULL) { mgr->es_unit = (es_unit_t *)xl_remove_head(&mgr->packets_idle, 1); } es_unit = mgr->es_unit; if (len > 0) { if (es_unit->size < es_unit->len + len) { /* reallocate frame buffer */ int new_size = es_unit->len + len + MAX_ES_FRAME_SIZE * 2; es_unit_t *new_es_unit; if ((new_es_unit = es_unit_alloc(new_size)) == NULL) { return NULL; } memcpy(new_es_unit, es_unit, sizeof(es_unit_t)); /* copy ES Unit information */ memcpy(new_es_unit->data, es_unit->data, es_unit->len * sizeof(uint8_t)); es_unit_free(es_unit); mgr->es_unit = es_unit = new_es_unit; } /* copy stream data */ if (DAVS2_ISUNIT(start_code) && es_unit->len > 0) { ret_es_unit = es_unit; /* fetch a node again from idle list */ es_unit = (es_unit_t *)xl_remove_head(&mgr->packets_idle, 1); mgr->es_unit = es_unit; } memcpy(es_unit->data + 
es_unit->len, data, len * sizeof(uint8_t)); es_unit->len += len; es_unit->pts = pts; es_unit->dts = dts; } /* check the pseudo start code */ if (ret_es_unit != NULL) { ret_es_unit->len = bs_dispose_pseudo_code(ret_es_unit->data, ret_es_unit->data, ret_es_unit->len); } #undef DAVS2_ISUNIT return ret_es_unit; } /* --------------------------------------------------------------------------- */ static void destroy_all_lists(davs2_mgr_t *mgr) { es_unit_t *es_unit = NULL; davs2_picture_t *pic = NULL; /* idle list */ for (;;) { if ((es_unit = (es_unit_t *)xl_remove_head_ex(&mgr->packets_idle)) == NULL) { break; } es_unit_free(es_unit); } /* recycle list */ for (;;) { if ((pic = (davs2_picture_t *)xl_remove_head_ex(&mgr->pic_recycle)) == NULL) { break; } davs2_free(pic); } if (mgr->es_unit) { es_unit_free(mgr->es_unit); mgr->es_unit = NULL; } xl_destroy(&mgr->packets_idle); xl_destroy(&mgr->pic_recycle); } /* --------------------------------------------------------------------------- */ static int create_all_lists(davs2_mgr_t *mgr) { es_unit_t *es_unit = NULL; int i; if (xl_init(&mgr->packets_idle ) != 0 || xl_init(&mgr->pic_recycle ) != 0) { goto fail; } for (i = 0; i < MAX_ES_FRAME_NUM + mgr->param.threads; i++) { es_unit = es_unit_alloc(MAX_ES_FRAME_SIZE); if (es_unit) { xl_append(&mgr->packets_idle, es_unit); } else { goto fail; } } return 0; fail: destroy_all_lists(mgr); return -1; } /* --------------------------------------------------------------------------- */ static void output_list_recycle_picture(davs2_mgr_t *mgr, davs2_outpic_t *pic) { pic->frame = NULL; /* picture may be obsolete (for a new sequence with different resolution), so we release it later */ xl_append(&mgr->pic_recycle, pic); } /* --------------------------------------------------------------------------- */ static int has_new_output_frame(davs2_mgr_t *mgr, davs2_t *h) { // TODO: decide whether the caller should wait for the current picture to finish decoding UNUSED_PARAMETER(mgr); UNUSED_PARAMETER(h); return 1; // 1: a picture will be available for output; 0: no picture to output } /* --------------------------------------------------------------------------- */ static davs2_outpic_t *output_list_get_one_output_picture(davs2_mgr_t *mgr) { davs2_outpic_t *pic = NULL; davs2_thread_mutex_lock(&mgr->mutex_mgr); while (mgr->outpics.pics) { davs2_frame_t *frame = mgr->outpics.pics->frame; assert(frame); if (frame->i_poc == mgr->outpics.output) { /* the next frame : output */ pic = mgr->outpics.pics; mgr->outpics.pics = pic->next; /* move on to the next frame */ mgr->outpics.output++; mgr->outpics.num_output_pic--; break; } else { /* TODO: needs a better scheme here, one that keeps the output order and efficiency without polling while several frames are still pending */ if (frame->i_poc > mgr->outpics.output) { /* the end of the stream occurs */ if (mgr->b_flushing && mgr->num_frames_in == mgr->num_frames_out + mgr->outpics.num_output_pic) { mgr->outpics.output++; continue; } /* a future frame */ int num_delayed_frames = 1; pic = mgr->outpics.pics; while (pic->next != NULL) { num_delayed_frames++; pic = pic->next; } if (num_delayed_frames < 8) { /* keep waiting */ davs2_thread_mutex_unlock(&mgr->mutex_mgr); davs2_sleep_ms(1); davs2_thread_mutex_lock(&mgr->mutex_mgr); continue; } } /* the smallest POC in the output list is far ahead of the expected output POC; advance the expected POC to the current smallest one */ davs2_log(mgr, DAVS2_LOG_WARNING, "Advance to discontinuous POC: %d\n", frame->i_poc); mgr->outpics.output = frame->i_poc; } } mgr->outpics.busy = (pic != NULL); davs2_thread_mutex_unlock(&mgr->mutex_mgr); return pic; } /* -------------------------------------------------------------------------- * Thread of decoder output (decoded raw data) */ int decoder_get_output(davs2_mgr_t *mgr, davs2_seq_info_t *headerset, davs2_picture_t *out_frame, int is_flush) { 
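/* Descriptive sketch of the logic below (editorial note, derived from this
 * function and output_list_get_one_output_picture() above): pictures are
 * handed back strictly in POC order; a pending sequence header is reported
 * first as DAVS2_GOT_HEADER, and unless the caller is flushing
 * (is_flush != 0) the routine returns DAVS2_DEFAULT rather than blocking
 * when too few frames are queued to make one deliverable. */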
davs2_outpic_t *pic = NULL; int b_wait_new_frame = mgr->num_frames_in + mgr->num_decoders - mgr->num_frames_out > 8 + mgr->num_aec_thread; while (mgr->num_frames_in > mgr->num_frames_out && /* no more output */ (b_wait_new_frame || is_flush)) { if (mgr->new_sps) { memcpy(headerset, &mgr->seq_info.head, sizeof(davs2_seq_info_t)); mgr->new_sps = FALSE; /* clear flag */ out_frame->magic = NULL; return DAVS2_GOT_HEADER; } /* check for the next frame */ pic = output_list_get_one_output_picture(mgr); if (pic == NULL) { davs2_sleep_ms(1); } else { break; } } if (pic == NULL) { if (mgr->new_sps) { memcpy(headerset, &mgr->seq_info.head, sizeof(davs2_seq_info_t)); mgr->new_sps = FALSE; /* clear flag */ out_frame->magic = NULL; return DAVS2_GOT_HEADER; } return DAVS2_DEFAULT; } mgr->num_frames_out++; /* copy out */ davs2_write_a_frame(pic->pic, pic->frame); /* release the reference once it is no longer needed */ if (pic->pic->dec_frame == NULL) { release_one_frame(pic->frame); } /* deliver this frame */ memcpy(out_frame, pic->pic, sizeof(davs2_picture_t)); out_frame->magic = pic; return DAVS2_GOT_FRAME; } /** * --------------------------------------------------------------------------- * Function : release one output frame * Parameters : * [in] : decoder - decoder handle * : out_frame - frame to recycle * Return : none * --------------------------------------------------------------------------- */ DAVS2_API void davs2_decoder_frame_unref(void *decoder, davs2_picture_t *out_frame) { davs2_mgr_t *mgr = (davs2_mgr_t *)decoder; if (mgr == NULL || out_frame == NULL) { return; } /* release the output */ if (out_frame->magic != NULL) { davs2_outpic_t *pic = (davs2_outpic_t *)out_frame->magic; /* release the reference once it is no longer needed */ if (pic->pic->dec_frame != NULL) { release_one_frame(pic->frame); // pic->pic->dec_frame == pic->frame pic->pic->dec_frame = NULL; } output_list_recycle_picture(mgr, pic); } } /* -------------------------------------------------------------------------- */ static davs2_t *task_get_free_task(davs2_mgr_t *mgr) { int i; for (; mgr->b_exit == 0;) { for (i = 0; i < mgr->num_decoders; i++) { davs2_t *h = &mgr->decoders[i]; davs2_thread_mutex_lock(&mgr->mutex_mgr); if (h->task_info.task_status == TASK_FREE) { h->task_info.task_status = TASK_BUSY; davs2_thread_mutex_unlock(&mgr->mutex_mgr); return h; } davs2_thread_mutex_unlock(&mgr->mutex_mgr); } } return NULL; } /* -------------------------------------------------------------------------- */ void task_unload_packet(davs2_t *h, es_unit_t *es_unit) { davs2_mgr_t *mgr = h->task_info.taskmgr; if (es_unit) { /* packet is free */ es_unit->len = 0; xl_append(&mgr->packets_idle, es_unit); } davs2_thread_mutex_lock(&mgr->mutex_mgr); h->task_info.task_status = TASK_FREE; davs2_thread_mutex_unlock(&mgr->mutex_mgr); } /** * =========================================================================== * interface function defines * =========================================================================== */ /* --------------------------------------------------------------------------- */ DAVS2_API void * davs2_decoder_open(davs2_param_t *param) { const int max_num_thread = CTRL_AEC_THREAD ? 
AVS2_THREAD_MAX : AVS2_THREAD_MAX / 2; char buf_cpu[120] = ""; davs2_mgr_t *mgr = NULL; uint8_t *mem_ptr; size_t mem_size; uint32_t cpuid = 0; int i; /* output version information */ if (param->info_level <= DAVS2_LOG_INFO) { davs2_log(NULL, DAVS2_LOG_INFO, "davs2: %s.%d, %s", XVERSION_STR, BIT_DEPTH, XBUILD_TIME); } #if DAVS2_TRACE_API fp_trace_bs = fopen("trace_bitstream.avs", "wb"); fp_trace_in = fopen("trace_input.txt", "w"); #endif /* check parameters */ if (param == NULL) { davs2_log(NULL, DAVS2_LOG_ERROR, "Invalid input parameters: Null parameters\n"); return 0; } /* init all function handlers */ #if HAVE_MMX cpuid = davs2_cpu_detect(); if (param->disable_avx) { cpuid &= ~(DAVS2_CPU_AVX | DAVS2_CPU_AVX2); } #endif init_all_primitives(cpuid); /* CPU capacities */ davs2_get_simd_capabilities(buf_cpu, cpuid); if (param->info_level <= DAVS2_LOG_INFO) { davs2_log(mgr, DAVS2_LOG_INFO, "CPU Capabilities: %s", buf_cpu); } mem_size = sizeof(davs2_mgr_t) + CACHE_LINE_SIZE + AVS2_THREAD_MAX * (sizeof(davs2_t) + CACHE_LINE_SIZE); CHECKED_MALLOCZERO(mem_ptr, uint8_t *, mem_size); mgr = (davs2_mgr_t *)mem_ptr; mem_ptr += sizeof(davs2_mgr_t); ALIGN_POINTER(mem_ptr); memcpy(&mgr->param, param, sizeof(davs2_param_t)); /* init log module */ mgr->module_log.i_log_level = param->info_level; sprintf(mgr->module_log.module_name, "Manager %06llx", (intptr_t)(mgr)); if (mgr->param.threads <= 0) { mgr->param.threads = davs2_cpu_num_processors(); } if (mgr->param.threads > max_num_thread) { mgr->param.threads = max_num_thread; davs2_log(mgr, DAVS2_LOG_WARNING, "Max number of threads reached, forcing to %d\n", max_num_thread); } /* init members that must not be zero */ mgr->i_prev_coi = -1; /* output pictures */ mgr->outpics.output = -1; mgr->outpics.pics = NULL; mgr->outpics.num_output_pic = 0; mgr->num_decoders = mgr->param.threads; mgr->num_total_thread = mgr->param.threads; mgr->num_aec_thread = mgr->param.threads; #if CTRL_AEC_THREAD if (mgr->num_total_thread > 3) { mgr->num_aec_thread = (mgr->param.threads >> 1) + 1; mgr->num_rec_thread = mgr->num_total_thread - mgr->num_aec_thread; } else { mgr->num_rec_thread = 0; } mgr->num_decoders += 1 + mgr->num_aec_thread; #else mgr->num_rec_thread = 0; #endif mgr->num_decoders++; mgr->decoders = (davs2_t *)mem_ptr; mem_ptr += AVS2_THREAD_MAX * sizeof(davs2_t); ALIGN_POINTER(mem_ptr); davs2_thread_mutex_init(&mgr->mutex_mgr, NULL); davs2_thread_mutex_init(&mgr->mutex_aec, NULL); /* init input&output lists */ if (create_all_lists(mgr) < 0) { goto fail; } /* validate the thread configuration */ if (mgr->num_total_thread < 1 || mgr->num_decoders < mgr->num_aec_thread || mgr->num_rec_thread < 0 || mgr->num_aec_thread < 1 || mgr->num_aec_thread > mgr->num_total_thread) { davs2_log(mgr, DAVS2_LOG_ERROR, "Invalid thread number configuration: num_task[%d], num_threads[%d], num_aec_thread[%d], num_pool[%d]\n", mgr->num_decoders, mgr->num_total_thread, mgr->num_aec_thread, mgr->num_rec_thread); goto fail; } /* spawn the output thread */ mgr->num_frames_in = 0; mgr->num_frames_out = 0; /* init all the tasks */ for (i = 0; i < mgr->num_decoders; i++) { davs2_t *h = &mgr->decoders[i]; /* init the decode context */ decoder_open(mgr, h, i); // davs2_log(h, DAVS2_LOG_WARNING, "Decoder [%2d]: %p", i, h); h->task_info.task_id = i; h->task_info.task_status = TASK_FREE; h->task_info.taskmgr = mgr; } /* initialize thread pool for AEC decoding and reconstruction */ davs2_threadpool_init((davs2_threadpool_t **)&mgr->thread_pool, mgr->num_total_thread, NULL, NULL, 0);
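/* Illustrative example of the thread partitioning above (editorial; the
 * numbers are assumed and CTRL_AEC_THREAD is assumed to be enabled): with
 * param.threads == 8, num_aec_thread becomes (8 >> 1) + 1 == 5 and
 * num_rec_thread 8 - 5 == 3, while num_decoders grows from 8 by 1 + 5 plus
 * the final increment to 15 decoder tasks sharing one thread pool. */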
davs2_log(mgr, DAVS2_LOG_INFO, "using %d thread(s): %d(frame/AEC)+%d(pool/REC), %d tasks", mgr->num_total_thread, mgr->num_aec_thread, mgr->num_rec_thread, mgr->num_decoders); return mgr; fail: davs2_log(NULL, DAVS2_LOG_ERROR, "failed to open decoder\n"); davs2_decoder_close(mgr); return NULL; } /* --------------------------------------------------------------------------- */ int decoder_decode_es_unit(davs2_mgr_t *mgr, es_unit_t *es_unit) { davs2_t *h = NULL; int b_wait_output = 0; /* decode this frame * (1) init bs */ bs_init(&es_unit->bs, es_unit->data, es_unit->len); h = task_get_free_task(mgr); mgr->h_dec = h; davs2_thread_mutex_lock(&mgr->mutex_aec); h->task_info.curr_es_unit = es_unit; /* record the ES_unit to be decoded */ /* (2) parse header */ if (parse_header(h, &es_unit->bs) == 0) { h->p_bs = &es_unit->bs; /* TODO: use the picture header information to decide whether a picture needs to be output at this point */ /* prepare the reference list and the reconstruction buffer */ if (task_get_references(h, es_unit->pts, es_unit->dts) == 0) { b_wait_output = has_new_output_frame(mgr, h); mgr->num_frames_in++; davs2_thread_mutex_unlock(&mgr->mutex_aec); /* decode picture data */ davs2_threadpool_run((davs2_threadpool_t *)mgr->thread_pool, decoder_decode_picture_data, h, 0, 0); } } else { davs2_thread_mutex_unlock(&mgr->mutex_aec); /* task is free */ task_unload_packet(h, es_unit); } return b_wait_output; } /* --------------------------------------------------------------------------- */
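/* ---------------------------------------------------------------------------
 * Minimal usage sketch of the public API below and davs2_decoder_open()
 * above (editorial, illustrative only: error handling is omitted and the
 * caller-side names buf, buf_len and consume() are assumptions, not part of
 * the library):
 *
 *     void *dec = davs2_decoder_open(&param);
 *     davs2_packet_t pkt;
 *     davs2_seq_info_t hdr;
 *     davs2_picture_t frame;
 *     pkt.data = buf; pkt.len = buf_len; pkt.pts = pts; pkt.dts = dts;
 *     if (davs2_decoder_send_packet(dec, &pkt) != DAVS2_ERROR &&
 *         davs2_decoder_recv_frame(dec, &hdr, &frame) == DAVS2_GOT_FRAME) {
 *         consume(&frame);
 *         davs2_decoder_frame_unref(dec, &frame);
 *     }
 *     while (davs2_decoder_flush(dec, &hdr, &frame) != DAVS2_END) {
 *         if (frame.magic != NULL) {
 *             consume(&frame);
 *             davs2_decoder_frame_unref(dec, &frame);
 *         }
 *     }
 *     davs2_decoder_close(dec);
 */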
Len = %d", packet->len); return DAVS2_ERROR; /* error */ } /* check the first 3 bytes are START_CODE */ if (packet->data[0] != 0x00 || packet->data[1] != 0x00 || packet->data[2] != 0x01) { davs2_log(mgr, DAVS2_LOG_ERROR, "Invalid input Byte-Stream, not start code: %02x%02x%02x", packet->data[0], packet->data[1], packet->data[2]); return DAVS2_ERROR; } /* generate one es_unit for current byte-stream buffer */ es_unit = davs2_pack_es_unit(mgr, packet->data, packet->len, packet->pts, packet->dts); if (es_unit == NULL && mgr->es_unit == NULL) { davs2_log(mgr, DAVS2_LOG_ERROR, "Failed to create an ES_UNIT, input Byte-Stream length %d", packet->len); return DAVS2_ERROR; } else if (es_unit == NULL) { // davs2_log(mgr, DAVS2_LOG_DEBUG, "Buffered byte-stream length: %d", // packet->len); return DAVS2_DEFAULT; } /* decode one frame */ mgr->num_frames_to_output += decoder_decode_es_unit(mgr, es_unit); #if DAVS2_TRACE_API if (fp_trace_in) { fprintf(fp_trace_in, "\t%8d\t%2d\t%4d\t%3d\t%3d\n", packet->len, ret_type, out_frame->pic_order_count, mgr->num_frames_in, mgr->num_frames_out); fflush(fp_trace_in); } #endif return ret_type; } /* --------------------------------------------------------------------------- */ DAVS2_API int davs2_decoder_recv_frame(void *decoder, davs2_seq_info_t *headerset, davs2_picture_t *out_frame) { davs2_mgr_t *mgr = (davs2_mgr_t *)decoder; int ret_type = DAVS2_DEFAULT; /* clear output frame data */ out_frame->magic = NULL; /* get one frame or sequence header */ if (mgr->num_frames_to_output || mgr->new_sps) { ret_type = decoder_get_output(mgr, headerset, out_frame, 0); if (ret_type == DAVS2_GOT_FRAME) { mgr->num_frames_to_output--; } } return ret_type; } /* --------------------------------------------------------------------------- */ DAVS2_API int davs2_decoder_flush(void *decoder, davs2_seq_info_t *headerset, davs2_picture_t *out_frame) { davs2_mgr_t *mgr = (davs2_mgr_t *)decoder; int ret; #if DAVS2_TRACE_API if (fp_trace_in) { fprintf(fp_trace_in, "Flush 0x%p ", decoder); fflush(fp_trace_in); } #endif if (decoder == NULL) { return DAVS2_ERROR; } mgr->b_flushing = 1; // label the decoder being flushing out_frame->magic = NULL; ret = DAVS2_DEFAULT; #if DAVS2_TRACE_API if (fp_trace_in) { fprintf(fp_trace_in, "Fetch "); fflush(fp_trace_in); } #endif // flush buffered bit-stream if (mgr->es_unit != NULL && mgr->es_unit->len >= 4) { es_unit_t *es_unit = mgr->es_unit; mgr->es_unit = NULL; decoder_decode_es_unit(mgr, es_unit); } ret = decoder_get_output(mgr, headerset, out_frame, 1); #if DAVS2_TRACE_API if (fp_trace_in) { fprintf(fp_trace_in, "Ret %d, %3d\t%3d\n", ret, mgr->num_frames_in, mgr->num_frames_out); fflush(fp_trace_in); } #endif if (ret != DAVS2_DEFAULT) { return ret; } else { return DAVS2_END; } } /* --------------------------------------------------------------------------- */ DAVS2_API void davs2_decoder_close(void *decoder) { davs2_mgr_t *mgr = (davs2_mgr_t *)decoder; int i; #if DAVS2_TRACE_API if (fp_trace_in != NULL) { fprintf(fp_trace_in, "Close 0x%p\n", decoder); fflush(fp_trace_in); } #endif if (mgr == NULL) { return; } /* signal all decoding threads and the output thread to exit */ mgr->b_exit = 1; /* destroy thread pool */ if (mgr->num_total_thread != 0) { davs2_threadpool_delete((davs2_threadpool_t *)mgr->thread_pool); } /* close every task */ for (i = 0; i < mgr->num_decoders; i++) { davs2_t *h = &mgr->decoders[i]; /* free all resources of the decoder */ decoder_close(h); } destroy_all_lists(mgr); /* free all lists */ destroy_dpb(mgr); /* free dpb */ /* 
destroy the mutex */ davs2_thread_mutex_destroy(&mgr->mutex_mgr); davs2_thread_mutex_destroy(&mgr->mutex_aec); /* free memory */ davs2_free(mgr); /* free the mgr */ #if DAVS2_TRACE_API if (fp_trace_bs != NULL) { fclose(fp_trace_bs); fp_trace_bs = NULL; } if (fp_trace_in != NULL) { fclose(fp_trace_in); fp_trace_in = NULL; } #endif } davs2-1.6/source/common/deblock.cc000066400000000000000000000612401337322544400171130ustar00rootroot00000000000000/* * deblock.cc * * Description of this file: * Deblock functions definition of the davs2 library * * -------------------------------------------------------------------------- * * davs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ #include "common.h" #include "deblock.h" #include "quant.h" #if HAVE_MMX #include "vec/intrinsic.h" #endif /* --------------------------------------------------------------------------- */ static const uint8_t ALPHA_TABLE[64] = { 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 3, 3, 4, 4, 5, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 18, 20, 22, 24, 26, 28, 30, 33, 33, 35, 35, 36, 37, 37, 39, 39, 42, 44, 46, 48, 50, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64 }; /* --------------------------------------------------------------------------- */ static const uint8_t BETA_TABLE[64] = { 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 9, 9, 10, 10, 11, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 23, 24, 24, 25, 25, 26, 27 }; /* --------------------------------------------------------------------------- */ extern const uint8_t QP_SCALE_CR[64]; /* --------------------------------------------------------------------------- * edge direction for deblock */ enum edge_direction_e { EDGE_HOR = 1, /* horizontal */ EDGE_VER = 0 /* vertical */ }; /* --------------------------------------------------------------------------- * edge type for fitler control */ enum edge_type_e { EDGE_TYPE_NOFILTER = 0, /* no deblock filter */ EDGE_TYPE_ONLY_LUMA = 1, /* TU boundary in CU (chroma block does not have such boundaries) */ EDGE_TYPE_BOTH = 2 /* CU boundary and PU boundary */ }; /* --------------------------------------------------------------------------- */ static void lf_set_edge_filter_param(davs2_t *h, int i_level, int scu_x, int scu_y, int dir, int edge_type) { const int w_in_scu = h->i_width_in_scu; // const int h_in_scu = h->i_height_in_mincu; int scu_num = 1 << (i_level - MIN_CU_SIZE_IN_BIT); int scu_xy = scu_y * w_in_scu + scu_x; int i; if (dir == EDGE_VER) { /* set flag of vertical edges */ if (scu_x == 0) { return; } /* Is left border Slice border? 
* check edge condition, can not filter beyond frame/slice boundaries */ if (!h->seq_info.cross_loop_filter_flag && h->scu_data[scu_xy].i_slice_nr != h->scu_data[scu_xy - 1].i_slice_nr) { return; } /* set filter type */ // scu_num = DAVS2_MIN(scu_num, h_in_scu - scu_y); for (i = 0; i < scu_num; i++) { if (h->p_deblock_flag[EDGE_VER][(scu_y + i) * w_in_scu + scu_x] != EDGE_TYPE_NOFILTER) { break; } h->p_deblock_flag[EDGE_VER][(scu_y + i) * w_in_scu + scu_x] = (uint8_t)edge_type; } } else { /* set flag of horizontal edges */ if (scu_y == 0) { return; } /* Is top border Slice border? * check edge condition, can not filter beyond frame/slice boundaries */ if (!h->seq_info.cross_loop_filter_flag && h->scu_data[scu_xy].i_slice_nr != h->scu_data[scu_xy - h->i_width_in_scu].i_slice_nr) { return; } /* set filter type */ // scu_num = DAVS2_MIN(scu_num, w_in_scu - scu_x); for (i = 0; i < scu_num; i++) { if (h->p_deblock_flag[EDGE_HOR][scu_y * w_in_scu + scu_x + i] != EDGE_TYPE_NOFILTER) { break; } h->p_deblock_flag[EDGE_HOR][scu_y * w_in_scu + scu_x + i] = (uint8_t)edge_type; } } } /* --------------------------------------------------------------------------- */ static void lf_lcu_set_edge_filter(davs2_t *h, int i_level, int scu_x, int scu_y) { const int w_in_scu = h->i_width_in_scu; cu_t *p_scu_data = &h->scu_data[scu_y * w_in_scu + scu_x]; int i; if (p_scu_data->i_cu_level < i_level) { const int h_in_scu = h->i_height_in_scu; // 4 sub-cu for (i = 0; i < 4; i++) { int sub_cu_x = scu_x + ((i & 1) << (i_level - MIN_CU_SIZE_IN_BIT - 1)); int sub_cu_y = scu_y + ((i >> 1) << (i_level - MIN_CU_SIZE_IN_BIT - 1)); if (sub_cu_x >= w_in_scu || sub_cu_y >= h_in_scu) { continue; // is outside of the frame } lf_lcu_set_edge_filter(h, i_level - 1, sub_cu_x, sub_cu_y); } } else { // set the first left and top edge filter parameters lf_set_edge_filter_param(h, i_level, scu_x, scu_y, EDGE_VER, EDGE_TYPE_BOTH); // left edge lf_set_edge_filter_param(h, i_level, scu_x, scu_y, EDGE_HOR, EDGE_TYPE_BOTH); // top edge // set other edge filter parameters if (p_scu_data->i_cu_level > B8X8_IN_BIT) { /* set prediction boundary */ i = i_level - MIN_CU_SIZE_IN_BIT - 1; switch (p_scu_data->i_cu_type) { case PRED_2NxN: lf_set_edge_filter_param(h, i_level, scu_x, scu_y + (1 << i), EDGE_HOR, EDGE_TYPE_BOTH); break; case PRED_Nx2N: lf_set_edge_filter_param(h, i_level, scu_x + (1 << i), scu_y, EDGE_VER, EDGE_TYPE_BOTH); break; case PRED_I_NxN: lf_set_edge_filter_param(h, i_level, scu_x + (1 << i), scu_y, EDGE_VER, EDGE_TYPE_BOTH); lf_set_edge_filter_param(h, i_level, scu_x, scu_y + (1 << i), EDGE_HOR, EDGE_TYPE_BOTH); break; case PRED_I_2Nxn: if (i > 0) { lf_set_edge_filter_param(h, i_level, scu_x, scu_y + (1 << (i - 1)), EDGE_HOR, EDGE_TYPE_ONLY_LUMA); lf_set_edge_filter_param(h, i_level, scu_x, scu_y + (1 << (i - 1)) * 2, EDGE_HOR, EDGE_TYPE_ONLY_LUMA); lf_set_edge_filter_param(h, i_level, scu_x, scu_y + (1 << (i - 1)) * 3, EDGE_HOR, EDGE_TYPE_ONLY_LUMA); } else { lf_set_edge_filter_param(h, i_level, scu_x, scu_y + (1 << (i )), EDGE_HOR, EDGE_TYPE_ONLY_LUMA); } break; case PRED_I_nx2N: if (i > 0) { lf_set_edge_filter_param(h, i_level, scu_x + (1 << (i - 1)), scu_y, EDGE_VER, EDGE_TYPE_ONLY_LUMA); lf_set_edge_filter_param(h, i_level, scu_x + (1 << (i - 1)) * 2, scu_y, EDGE_VER, EDGE_TYPE_ONLY_LUMA); lf_set_edge_filter_param(h, i_level, scu_x + (1 << (i - 1)) * 3, scu_y, EDGE_VER, EDGE_TYPE_ONLY_LUMA); } else { lf_set_edge_filter_param(h, i_level, scu_x + (1 << (i )), scu_y, EDGE_VER, EDGE_TYPE_ONLY_LUMA); } break; case PRED_2NxnU: if 
(i > 0) { lf_set_edge_filter_param(h, i_level, scu_x, scu_y + (1 << (i - 1)), EDGE_HOR, EDGE_TYPE_BOTH); } break; case PRED_2NxnD: if (i > 0) { lf_set_edge_filter_param(h, i_level, scu_x, scu_y + (1 << (i - 1)) * 3, EDGE_HOR, EDGE_TYPE_BOTH); } break; case PRED_nLx2N: if (i > 0) { lf_set_edge_filter_param(h, i_level, scu_x + (1 << (i - 1)), scu_y, EDGE_VER, EDGE_TYPE_BOTH); } break; case PRED_nRx2N: if (i > 0) { lf_set_edge_filter_param(h, i_level, scu_x + (1 << (i - 1)) * 3, scu_y, EDGE_VER, EDGE_TYPE_BOTH); } break; default: // for other modes: direct/skip, 2Nx2N inter, 2Nx2N intra, no need to set break; } /* set transform block boundary */ if (p_scu_data->i_cu_type != PRED_I_NxN && p_scu_data->i_trans_size != TU_SPLIT_NON && p_scu_data->i_cbp != 0) { if (h->seq_info.enable_nsqt && IS_HOR_PU_PART(p_scu_data->i_cu_type)) { if (p_scu_data->i_cu_level == B16X16_IN_BIT) { lf_set_edge_filter_param(h, i_level, scu_x, scu_y + (1 << (i )), EDGE_HOR, EDGE_TYPE_ONLY_LUMA); } else { lf_set_edge_filter_param(h, i_level, scu_x, scu_y + (1 << (i - 1)), EDGE_HOR, EDGE_TYPE_ONLY_LUMA); lf_set_edge_filter_param(h, i_level, scu_x, scu_y + (1 << (i )), EDGE_HOR, EDGE_TYPE_ONLY_LUMA); lf_set_edge_filter_param(h, i_level, scu_x, scu_y + (1 << (i )) + (1 << (i - 1)), EDGE_HOR, EDGE_TYPE_ONLY_LUMA); } } else if (h->seq_info.enable_nsqt && IS_VER_PU_PART(p_scu_data->i_cu_type)) { if (p_scu_data->i_cu_level == B16X16_IN_BIT) { lf_set_edge_filter_param(h, i_level, scu_x + (1 << (i )), scu_y, EDGE_VER, EDGE_TYPE_ONLY_LUMA); } else { lf_set_edge_filter_param(h, i_level, scu_x + (1 << (i - 1)), scu_y, EDGE_VER, EDGE_TYPE_ONLY_LUMA); lf_set_edge_filter_param(h, i_level, scu_x + (1 << (i )), scu_y, EDGE_VER, EDGE_TYPE_ONLY_LUMA); lf_set_edge_filter_param(h, i_level, scu_x + (1 << (i )) + (1 << (i - 1)), scu_y, EDGE_VER, EDGE_TYPE_ONLY_LUMA); } } else { lf_set_edge_filter_param(h, i_level, scu_x + (1 << i), scu_y, EDGE_VER, EDGE_TYPE_ONLY_LUMA); lf_set_edge_filter_param(h, i_level, scu_x, scu_y + (1 << i), EDGE_HOR, EDGE_TYPE_ONLY_LUMA); } } } } /* --------------------------------------------------------------------------- * return 0 if deblocking of this edge can be skipped, 1 if it should be filtered */ static uint8_t lf_skip_filter(davs2_t *h, cu_t *scuP, cu_t *scuQ, int dir, int block_x, int block_y) { if (h->i_frame_type == AVS2_P_SLICE || h->i_frame_type == AVS2_F_SLICE) { const int width_in_spu = h->i_width_in_spu; int pos1 = block_y * width_in_spu + block_x; int pos2 = (block_y - dir) * width_in_spu + (block_x - !dir); int ref1 = h->p_ref_idx[pos1].r[0]; int ref2 = h->p_ref_idx[pos2].r[0]; mv_t mv_1, mv_2; mv_1.v = h->p_tmv_1st[pos1].v; mv_2.v = h->p_tmv_1st[pos2].v; if ((scuP->i_cbp == 0) && (scuQ->i_cbp == 0) && (DAVS2_ABS(mv_1.x - mv_2.x) < 4) && (DAVS2_ABS(mv_1.y - mv_2.y) < 4) && (ref1 != INVALID_REF && ref1 == ref2)) { return 0; } } return 1; } /* --------------------------------------------------------------------------- */ static void lf_edge_core(pel_t *src, int b_chroma, int ptr_inc, int inc1, int alpha, int beta, uint8_t *flt_flag) { int inc2 = inc1 << 1; int inc3 = inc1 + inc2; int abs_delta; int L2, L1, L0, R0, R1, R2; int fs; // fs stands for filtering strength; the larger fs is, the stronger the applied filter int FlatnessL, FlatnessR; // FlatnessL and FlatnessR describe how flat the signal is on each side of the edge int flag; int pel; for (pel = 0; pel < MIN_CU_SIZE; pel++) { L2 = src[-inc3]; L1 = src[-inc2]; L0 = src[-inc1]; R0 = src[ 0]; R1 = src[ inc1]; R2 = src[ inc2]; abs_delta = DAVS2_ABS(R0 - L0); flag = (pel < 4) ? 
flt_flag[0] : flt_flag[1]; if (flag && (abs_delta < alpha) && (abs_delta > 1)) { FlatnessL = (DAVS2_ABS(L1 - L0) < beta) ? 2 : 0; FlatnessL += (DAVS2_ABS(L2 - L0) < beta); FlatnessR = (DAVS2_ABS(R0 - R1) < beta) ? 2 : 0; FlatnessR += (DAVS2_ABS(R0 - R2) < beta); switch (FlatnessL + FlatnessR) { case 6: fs = 3 + ((R1 == R0) && (L0 == L1)); // ((R1 == R0) && (L0 == L1)) ? 4 : 3; break; case 5: fs = 2 + ((R1 == R0) && (L0 == L1)); // ((R1 == R0) && (L0 == L1)) ? 3 : 2; break; case 4: fs = 1 + (FlatnessL == 2); // (FlatnessL == 2) ? 2 : 1; break; case 3: fs = (DAVS2_ABS(L1 - R1) < beta); break; default: fs = 0; } fs -= (b_chroma && fs > 0); switch (fs) { case 4: src[-inc1] = (pel_t)((L0 + ((L0 + L2) << 3) + L2 + (R0 << 3) + (R2 << 2) + (R2 << 1) + 16) >> 5); // L0 src[-inc2] = (pel_t)(((L0 << 3) - L0 + (L2 << 2) + (L2 << 1) + R0 + (R0 << 1) + 8) >> 4); // L1 src[-inc3] = (pel_t)(((L0 << 2) + L2 + (L2 << 1) + R0 + 4) >> 3); // L2 src[ 0] = (pel_t)((R0 + ((R0 + R2) << 3) + R2 + (L0 << 3) + (L2 << 2) + (L2 << 1) + 16) >> 5); // R0 src[ inc1] = (pel_t)(((R0 << 3) - R0 + (R2 << 2) + (R2 << 1) + L0 + (L0 << 1) + 8) >> 4); // R1 src[ inc2] = (pel_t)(((R0 << 2) + R2 + (R2 << 1) + L0 + 4) >> 3); // R2 break; case 3: src[-inc1] = (pel_t)((L2 + (L1 << 2) + (L0 << 2) + (L0 << 1) + (R0 << 2) + R1 + 8) >> 4); // L0 src[ 0] = (pel_t)((L1 + (L0 << 2) + (R0 << 2) + (R0 << 1) + (R1 << 2) + R2 + 8) >> 4); // R0 src[-inc2] = (pel_t)((L2 * 3 + L1 * 8 + L0 * 4 + R0 + 8) >> 4); src[ inc1] = (pel_t)((R2 * 3 + R1 * 8 + R0 * 4 + L0 + 8) >> 4); break; case 2: src[-inc1] = (pel_t)(((L1 << 1) + L1 + (L0 << 3) + (L0 << 1) + (R0 << 1) + R0 + 8) >> 4); src[ 0] = (pel_t)(((L0 << 1) + L0 + (R0 << 3) + (R0 << 1) + (R1 << 1) + R1 + 8) >> 4); break; case 1: src[-inc1] = (pel_t)((L0 * 3 + R0 + 2) >> 2); src[ 0] = (pel_t)((R0 * 3 + L0 + 2) >> 2); break; default: break; } } src += ptr_inc; // next row or column pel += b_chroma; } } /* --------------------------------------------------------------------------- */ static void deblock_edge_hor(pel_t *src, int stride, int alpha, int beta, uint8_t *flt_flag) { lf_edge_core(src, 0, 1, stride, alpha, beta, flt_flag); } /* --------------------------------------------------------------------------- */ static void deblock_edge_ver(pel_t *src, int stride, int alpha, int beta, uint8_t *flt_flag) { lf_edge_core(src, 0, stride, 1, alpha, beta, flt_flag); } /* --------------------------------------------------------------------------- */ #if HDR_CHROMA_DELTA_QP static void deblock_edge_ver_c(pel_t *src_u, pel_t *src_v, int stride, int *alpha, int *beta, uint8_t *flt_flag) #else static void deblock_edge_ver_c(pel_t *src_u, pel_t *src_v, int stride, int alpha, int beta, uint8_t *flt_flag) #endif { #if HDR_CHROMA_DELTA_QP lf_edge_core(src_u, 1, stride, 1, alpha[0], beta[0], flt_flag); lf_edge_core(src_v, 1, stride, 1, alpha[1], beta[1], flt_flag); #else lf_edge_core(src_u, 1, stride, 1, alpha, beta, flt_flag); lf_edge_core(src_v, 1, stride, 1, alpha, beta, flt_flag); #endif } /* --------------------------------------------------------------------------- */ #if HDR_CHROMA_DELTA_QP static void deblock_edge_hor_c(pel_t *src_u, pel_t *src_v, int stride, int *alpha, int *beta, uint8_t *flt_flag) #else static void deblock_edge_hor_c(pel_t *src_u, pel_t *src_v, int stride, int alpha, int beta, uint8_t *flt_flag) #endif { #if HDR_CHROMA_DELTA_QP lf_edge_core(src_u, 1, 1, stride, alpha[0], beta[0], flt_flag); lf_edge_core(src_v, 1, 1, stride, alpha[1], beta[1], flt_flag); #else lf_edge_core(src_u, 1, 1, 
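/* NOTE: chroma edges reuse lf_edge_core() with b_chroma = 1, which has two
 * effects visible above: the filter strength is reduced by one
 * (`fs -= (b_chroma && fs > 0)`, so chroma never takes the strongest
 * fs == 4 path), and `pel += b_chroma` makes the sample counter advance by
 * two per iteration, so only MIN_CU_SIZE / 2 = 4 chroma samples are
 * filtered per edge -- the subsampled edge length of an 8x8 luma SCU in
 * 4:2:0. U and V are filtered with identical alpha/beta unless
 * HDR_CHROMA_DELTA_QP is enabled, in which case per-component thresholds
 * are passed in as two-element arrays. */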
stride, alpha, beta, flt_flag); lf_edge_core(src_v, 1, 1, stride, alpha, beta, flt_flag); #endif } /* --------------------------------------------------------------------------- * deblock one coding unit */ static void lf_scu_deblock(davs2_t *h, pel_t *p_dec[3], int stride, int stride_c, int scu_x, int scu_y, int dir) { static const int max_qp_deblock = 63; const int scu_xy = scu_y * h->i_width_in_scu + scu_x; cu_t *scuQ = &h->scu_data[scu_xy]; int edge_condition = h->p_deblock_flag[dir][scu_xy]; /* deblock edges */ if (edge_condition != EDGE_TYPE_NOFILTER) { const int shift = h->sample_bit_depth - 8; cu_t *scuP = (dir) ? (scuQ - h->i_width_in_scu) : (scuQ - 1); uint8_t b_filter_flag[2]; int QP; b_filter_flag[0] = lf_skip_filter(h, scuP, scuQ, dir, (scu_x << 1), (scu_y << 1) ); b_filter_flag[1] = lf_skip_filter(h, scuP, scuQ, dir, (scu_x << 1) + dir, (scu_y << 1) + !dir); if (!b_filter_flag[0] && !b_filter_flag[1]) { return; /* neither 4-sample segment of this edge needs filtering */ } /* deblock luma edge */ { pel_t *src_y = p_dec[0] + (scu_y << MIN_CU_SIZE_IN_BIT) * stride + (scu_x << MIN_CU_SIZE_IN_BIT); int alpha, beta; QP = ((scuP->i_qp + scuQ->i_qp + 1) >> 1); // average QP of the two blocks /* for 10/12-bit coding, QP carries an offset of 8 * (sample_bit_depth - 8) added at the encoder */ alpha = ALPHA_TABLE[DAVS2_CLIP3(0, max_qp_deblock, QP - (shift << 3) + h->i_alpha_offset)] << shift; beta = BETA_TABLE [DAVS2_CLIP3(0, max_qp_deblock, QP - (shift << 3) + h->i_beta_offset )] << shift; gf_davs2.deblock_luma[dir](src_y, stride, alpha, beta, b_filter_flag); } /* deblock chroma edge */ if (edge_condition == EDGE_TYPE_BOTH && h->i_chroma_format != CHROMA_400) if (((scu_y & 1) == 0 && dir) || (((scu_x & 1) == 0) && (!dir))) { int uv_offset = (scu_y << (MIN_CU_SIZE_IN_BIT - 1)) * stride_c + (scu_x << (MIN_CU_SIZE_IN_BIT - 1)); pel_t *src_u = p_dec[1] + uv_offset; pel_t *src_v = p_dec[2] + uv_offset; #if HDR_CHROMA_DELTA_QP int alpha[2], beta[2]; int luma_qp = QP; int offset = shift << 3; /* for 10/12-bit coding, QP carries an offset of 8 * (sample_bit_depth - 8) added at the encoder */ QP = cu_get_chroma_qp(h, luma_qp, 0) - offset; alpha[0] = ALPHA_TABLE[DAVS2_CLIP3(0, max_qp_deblock, QP + h->i_alpha_offset)] << shift; beta[0] = BETA_TABLE [DAVS2_CLIP3(0, max_qp_deblock, QP + h->i_beta_offset )] << shift; QP = cu_get_chroma_qp(h, luma_qp, 1) - offset; alpha[1] = ALPHA_TABLE[DAVS2_CLIP3(0, max_qp_deblock, QP + h->i_alpha_offset)] << shift; beta[1] = BETA_TABLE [DAVS2_CLIP3(0, max_qp_deblock, QP + h->i_beta_offset )] << shift; gf_davs2.deblock_chroma[dir](src_u, src_v, stride_c, alpha, beta, b_filter_flag); #else int alpha, beta; /* for 10/12-bit coding, QP carries an offset of 8 * (sample_bit_depth - 8) added at the encoder */ QP = cu_get_chroma_qp(h, QP, 0) - (shift << 3); alpha = ALPHA_TABLE[DAVS2_CLIP3(0, max_qp_deblock, QP + h->i_alpha_offset)] << shift; beta = BETA_TABLE[DAVS2_CLIP3(0, max_qp_deblock, QP + h->i_beta_offset)] << shift; gf_davs2.deblock_chroma[dir](src_u, src_v, stride_c, alpha, beta, b_filter_flag); #endif } } } /** * =========================================================================== * interface function defines * =========================================================================== */ /* --------------------------------------------------------------------------- * NOTE: only I420 is supported for now */ void davs2_lcu_deblock(davs2_t *h, davs2_frame_t *frm, int i_lcu_x, int i_lcu_y) { const int i_stride = frm->i_stride[0]; const int i_stride_c = frm->i_stride[1]; const int w_in_scu = h->i_width_in_scu; const int h_in_scu =
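/* NOTE: the alpha/beta thresholds in lf_scu_deblock() above are derived
 * from the average QP of the two neighboring SCUs, corrected for bit
 * depth: for 10-bit streams the QP in the bitstream already includes an
 * offset of 8 * (bit_depth - 8), so `QP - (shift << 3)` maps it back to
 * the 8-bit table range before the ALPHA_TABLE/BETA_TABLE lookup, and the
 * looked-up value is scaled back up with `<< shift`. A worked example
 * (the table value itself is illustrative, not quoted from tables.h):
 *
 *   10-bit stream: scuP->i_qp = 48, scuQ->i_qp = 50, i_alpha_offset = 0
 *   QP    = (48 + 50 + 1) >> 1 = 49
 *   shift = 10 - 8 = 2
 *   index = DAVS2_CLIP3(0, 63, 49 - 16 + 0) = 33
 *   alpha = ALPHA_TABLE[33] << 2
 */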
h->i_height_in_scu; const int num_in_scu = 1 << (h->i_lcu_level - MIN_CU_SIZE_IN_BIT); int scu_x = i_lcu_x << (h->i_lcu_level - MIN_CU_SIZE_IN_BIT); int scu_y = i_lcu_y << (h->i_lcu_level - MIN_CU_SIZE_IN_BIT); int num_of_scu_hor = DAVS2_MIN(w_in_scu - scu_x, num_in_scu); int num_of_scu_ver = DAVS2_MIN(h_in_scu - scu_y, num_in_scu); int i, j; /* ------------------------------------------------------------- * init */ /* set edge flags in one LCU */ lf_lcu_set_edge_filter(h, h->i_lcu_level, scu_x, scu_y); /* ------------------------------------------------------------- * vertical */ /* deblock all vertical edges in one LCU */ for (j = 0; j < num_of_scu_ver; j++) { for (i = 0; i < num_of_scu_hor; i++) { lf_scu_deblock(h, frm->planes, i_stride, i_stride_c, scu_x + i, scu_y + j, EDGE_VER); } } /* ------------------------------------------------------------- * horizontal */ /* adjust the value of scu_x and num_of_scu_hor */ if (scu_x == 0) { /* the current LCU is the first LCU in an LCU row */ num_of_scu_hor--; /* leave the last horizontal edge */ } else { /* the current LCU is a subsequent LCU in the row */ if (scu_x + num_of_scu_hor == w_in_scu) { /* the current LCU is the last LCU in the row, * so one extra horizontal edge must be deblocked */ num_of_scu_hor++; } scu_x--; /* begin from the last horizontal edge of the previous LCU */ } /* deblock all horizontal edges in one LCU */ for (j = 0; j < num_of_scu_ver; j++) { for (i = 0; i < num_of_scu_hor; i++) { lf_scu_deblock(h, frm->planes, i_stride, i_stride_c, scu_x + i, scu_y + j, EDGE_HOR); } } } /* --------------------------------------------------------------------------- * init deblock function handles */ void davs2_deblock_init(uint32_t cpuid, ao_funcs_t* fh) { UNUSED_PARAMETER(cpuid); fh->deblock_luma [0] = deblock_edge_ver; fh->deblock_luma [1] = deblock_edge_hor; fh->deblock_chroma[0] = deblock_edge_ver_c; fh->deblock_chroma[1] = deblock_edge_hor_c; fh->set_deblock_const = NULL; /* init asm function handles */ #if HAVE_MMX if ((cpuid & DAVS2_CPU_SSE4) && !HDR_CHROMA_DELTA_QP) { #if !HIGH_BIT_DEPTH fh->deblock_luma [0] = deblock_edge_ver_sse128; fh->deblock_luma [1] = deblock_edge_hor_sse128; fh->deblock_chroma[0] = deblock_edge_ver_c_sse128; fh->deblock_chroma[1] = deblock_edge_hor_c_sse128; #endif } if ((cpuid & DAVS2_CPU_AVX2) && !HDR_CHROMA_DELTA_QP) { #if !HIGH_BIT_DEPTH // fh->deblock_luma[0] = deblock_edge_ver_avx2; // @luofl: slower than sse128 on i7-6700K // fh->deblock_luma[1] = deblock_edge_hor_avx2; // fh->deblock_chroma[0] = deblock_edge_ver_c_avx2; // fh->deblock_chroma[1] = deblock_edge_hor_c_avx2; #endif } #endif // HAVE_MMX } davs2-1.6/source/common/deblock.h000066400000000000000000000032451337322544400167560ustar00rootroot00000000000000/* * deblock.h * * Description of this file: * Deblock functions definition of the davs2 library * * -------------------------------------------------------------------------- * * davs2 - video decoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details.
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ #ifndef DAVS2_DEBLOCK_H #define DAVS2_DEBLOCK_H #ifdef __cplusplus extern "C" { #endif #define davs2_deblock_init FPFX(deblock_init) void davs2_deblock_init(uint32_t cpuid, ao_funcs_t* fh); #define davs2_lcu_deblock FPFX(lcu_deblock) void davs2_lcu_deblock(davs2_t *h, davs2_frame_t *frm, int i_lcu_x, int i_lcu_y); #ifdef __cplusplus } #endif #endif // DAVS2_DEBLOCK_H davs2-1.6/source/common/decoder.cc000066400000000000000000001272041337322544400171200ustar00rootroot00000000000000/* * decoder.cc * * Description of this file: * Decoder functions definition of the davs2 library * * -------------------------------------------------------------------------- * * davs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. 
*/ #include "common.h" #include "davs2.h" #include "decoder.h" #include "aec.h" #include "header.h" #include "bitstream.h" #include "deblock.h" #include "cu.h" #include "sao.h" #include "alf.h" #include "quant.h" #include "frame.h" #include "intra.h" #include "mc.h" #include "transform.h" #include "cpu.h" #include "threadpool.h" #define TRACEFILE "trace_dec_HD.txt" /* trace file in current directory */ /* disable warning C4127: ʽdz */ #pragma warning(disable:4127) /** * =========================================================================== * local function defines * =========================================================================== */ /* --------------------------------------------------------------------------- * initializes the parameters for a new frame */ static void init_frame(davs2_t *h) { int num_spu = h->i_width_in_spu * h->i_height_in_spu; //int i; h->lcu.i_scu_xy = 0; h->i_slice_index = -1; h->b_slice_checked = 0; h->fdec->i_parsed_lcu_xy = -1; h->decoding_error = 0; // ־ /* 1, clear intra_mode buffer, set to default value (-1) */ memset(h->p_ipredmode - h->i_ipredmode - 16, DC_PRED, h->i_ipredmode * (h->i_height_in_spu + 1) * sizeof(int8_t)); memset(h->p_dirpred, PDIR_INVALID, num_spu * sizeof(int8_t)); /* 2, clear mv buffer (set all MVs to zero) */ gf_davs2.fast_memzero(h->p_ref_idx, num_spu * sizeof(ref_idx_t)); // gf_davs2.fast_memzero(h->p_tmv_1st, num_spu * sizeof(mv_t)); // gf_davs2.fast_memzero(h->p_tmv_2nd, num_spu * sizeof(mv_t)); /* 3, clear slice number for all SCU */ //repeat for init slice for current LCU //for (i = 0; i < h->i_size_in_scu; i++) { // h->scu_data[i].i_slice_nr = -1; //} /* 4, init adaptive frequency weighting quantization */ if (h->seq_info.enable_weighted_quant) { wq_init_frame_quant_param(h); wq_update_frame_matrix(h); } /* 5, copy frame properties for SAO & ALF */ if (h->b_sao) { davs2_frame_copy_properties(h->p_frame_sao, h->fdec); } if (h->b_alf) { int alf_enable = h->pic_alf_on[IMG_Y] != 0 || h->pic_alf_on[IMG_U] != 0 || h->pic_alf_on[IMG_V] != 0; if (alf_enable) { davs2_frame_copy_properties(h->p_frame_alf, h->fdec); } } /* 6, clear the p_deblock_flag buffer */ gf_davs2.fast_memzero(h->p_deblock_flag[0], h->i_width_in_scu * h->i_height_in_scu * 2 * sizeof(uint8_t)); /* 7, clear LCU info buffer */ #if CTRL_AEC_THREAD gf_davs2.fast_memzero(h->lcu_infos, sizeof(lcu_info_t) * h->i_width_in_lcu * h->i_height_in_lcu); #endif } /* --------------------------------------------------------------------------- * cache CTU border */ static INLINE void davs2_cache_lcu_border(pel_t *p_dst, const pel_t *p_top, const pel_t *p_left, int i_left, int lcu_width, int lcu_height) { int i; /* top, top-right */ memcpy(p_dst, p_top, (2 * lcu_width + 1) * sizeof(pel_t)); /* left */ for (i = 1; i <= lcu_height; i++) { p_dst[-i] = p_left[0]; p_left += i_left; } } /* --------------------------------------------------------------------------- * cache CTU border (UV components together) */ static INLINE void davs2_cache_lcu_border_uv(pel_t *p_dst_u, const pel_t *p_top_u, const pel_t *p_left_u, pel_t *p_dst_v, const pel_t *p_top_v, const pel_t *p_left_v, int i_left, int lcu_width, int lcu_height) { int i; /* top, top-right */ memcpy(p_dst_u, p_top_u, (2 * lcu_width + 1) * sizeof(pel_t)); memcpy(p_dst_v, p_top_v, (2 * lcu_width + 1) * sizeof(pel_t)); /* left */ for (i = 1; i <= lcu_height; i++) { p_dst_u[-i] = p_left_u[0]; p_dst_v[-i] = p_left_v[0]; p_left_u += i_left; p_left_v += i_left; } } /* 
--------------------------------------------------------------------------- */ static void save_mv_ref_info(davs2_t *h, int row) { const int w_in_spu = h->i_width_in_spu; const int h_in_spu = h->i_height_in_spu; const int spu_y = row << (h->i_lcu_level - MIN_PU_SIZE_IN_BIT); const int lcu_h_in_spu = 1 << (h->i_lcu_level - MIN_PU_SIZE_IN_BIT); mv_t *p_dst_mv = &h->fdec->mvbuf[spu_y * w_in_spu]; int8_t *p_dst_ref = &h->fdec->refbuf[spu_y * w_in_spu]; mv_t *p_src_mv; ref_idx_t *p_src_ref; int i, j, x, y; for (j = spu_y; j < DAVS2_MIN(spu_y + lcu_h_in_spu, h_in_spu); j++) { y = ((j >> MV_FACTOR_IN_BIT) << MV_FACTOR_IN_BIT) + 2; if (y >= h_in_spu) { y = (((j >> MV_FACTOR_IN_BIT) << MV_FACTOR_IN_BIT) + h_in_spu) >> 1; } p_src_mv = h->p_tmv_1st + y * w_in_spu; p_src_ref = h->p_ref_idx + y * w_in_spu; for (i = 0; i < w_in_spu; i++) { x = ((i >> MV_FACTOR_IN_BIT) << MV_FACTOR_IN_BIT) + 2; if (x >= w_in_spu) { x = (((i >> MV_FACTOR_IN_BIT) << MV_FACTOR_IN_BIT) + w_in_spu) >> 1; } p_dst_mv [i] = p_src_mv [x]; p_dst_ref[i] = p_src_ref[x].r[0]; } p_dst_mv += w_in_spu; p_dst_ref += w_in_spu; } } /* --------------------------------------------------------------------------- */ static davs2_outpic_t *get_one_free_picture(davs2_mgr_t *mgr, int w, int h) { davs2_outpic_t *pic = NULL; for (;;) { /* get one from recycle bin */ pic = (davs2_outpic_t *)xl_remove_head(&mgr->pic_recycle, 0); if ((pic == NULL) || (pic->pic->widths[0] == w && pic->pic->lines[0] == h)) { break; } /* obsolete picture */ free_picture(pic); pic = NULL; } if (pic == NULL) { /* no free picture: do not wait, just allocate a new one */ pic = alloc_picture(w, h); } return pic; } /* --------------------------------------------------------------------------- * wait until AEC parsing has advanced past the specified LCU */ static ALWAYS_INLINE void wait_lcu_row_parsed(davs2_t *h, davs2_frame_t *frm, int lcu_xy) { UNUSED_PARAMETER(h); if (lcu_xy > frm->i_parsed_lcu_xy) { davs2_thread_mutex_lock(&frm->mutex_frm); /* lock */ while (lcu_xy > frm->i_parsed_lcu_xy) { davs2_thread_cond_wait(&frm->cond_aec, &frm->mutex_frm); } davs2_thread_mutex_unlock(&frm->mutex_frm); /* unlock */ } } /* --------------------------------------------------------------------------- * wait until the given number of LCUs in an LCU row have been reconstructed */ static ALWAYS_INLINE void wait_lcu_row_reconed(davs2_t *h, davs2_frame_t *frm, int wait_lcu_y, int wait_lcu_coded) { UNUSED_PARAMETER(h); // wait_lcu_coded = DAVS2_MIN(h->i_width_in_lcu, wait_lcu_coded); if (frm->num_decoded_lcu_in_row[wait_lcu_y] < wait_lcu_coded) { davs2_thread_mutex_lock(&frm->mutex_recon); /* lock */ while (frm->num_decoded_lcu_in_row[wait_lcu_y] < wait_lcu_coded) { davs2_thread_cond_wait(&frm->conds_lcu_row[wait_lcu_y], &frm->mutex_recon); } davs2_thread_mutex_unlock(&frm->mutex_recon); /* unlock */ } } /* --------------------------------------------------------------------------- */ static void decoder_signal(davs2_t *h, davs2_frame_t *frame, int line) { if (line > 0) { wait_lcu_row_reconed(h, frame, line - 1, h->i_width_in_lcu + 1); } davs2_thread_mutex_lock(&frame->mutex_recon); frame->i_decoded_line++; frame->num_decoded_lcu_in_row[line] = h->i_width_in_lcu + 3; davs2_thread_mutex_unlock(&frame->mutex_recon); davs2_thread_cond_broadcast(&frame->conds_lcu_row[line]); } /* --------------------------------------------------------------------------- */ static void task_send_picture_to_output_list(davs2_t *h, davs2_outpic_t *pic) { davs2_mgr_t *mgr = h->task_info.taskmgr; davs2_outpic_t *curr = NULL; davs2_outpic_t *prev = NULL; davs2_thread_mutex_lock(&mgr->mutex_mgr); curr = mgr->outpics.pics; while (curr &&
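/* NOTE: the output list is kept sorted by POC (display order), because
 * frames finish decoding in coding order. The loop below walks the singly
 * linked list to the first entry whose POC is not smaller than the new
 * picture's, then splices the picture in before it, so the output thread
 * can always emit the head of the list. Sketch of the pattern (generic
 * names, not the actual fields):
 *
 *   prev = NULL; curr = head;
 *   while (curr != NULL && curr->poc < new_poc) { prev = curr; curr = curr->next; }
 *   node->next = curr;
 *   if (prev) prev->next = node; else head = node;
 */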
curr->frame->i_poc < pic->frame->i_poc) { prev = curr; curr = curr->next; } /* duplicate frame? */ if (curr != NULL && curr->frame->i_poc == pic->frame->i_poc) { davs2_log(h, DAVS2_LOG_WARNING, "detected duplicate POC %d", curr->frame->i_poc); } /* insert this frame before 'curr' */ pic->next = curr; if (prev) { prev->next = pic; } else { mgr->outpics.pics = pic; } mgr->outpics.num_output_pic++; DAVS2_ASSERT(h->task_info.task_status == TASK_BUSY, "Invalid task status %d", h->task_info.task_status); davs2_thread_mutex_unlock(&mgr->mutex_mgr); } /* --------------------------------------------------------------------------- */ static void task_output_decoding_frame(davs2_t *h) { davs2_mgr_t *mgr = h->task_info.taskmgr; davs2_frame_t *frame = h->fdec; davs2_seq_t *seqhead = &h->seq_info; davs2_outpic_t *pic = NULL; assert(frame); pic = get_one_free_picture(mgr, h->i_image_width, h->i_image_height); assert(pic); memcpy(pic->head, &seqhead->head, sizeof(davs2_seq_info_t)); if (frame->i_type == AVS2_GB_SLICE) { pic->frame = h->f_background_ref; ///!!! FIXME: actually NOT working (we do not support S frames now). } else { pic->frame = frame; } frame->i_chroma_format = h->i_chroma_format; frame->i_output_bit_depth = h->output_bit_depth; frame->i_sample_bit_depth = h->sample_bit_depth; frame->frm_decode_error = h->decoding_error; h->decoding_error = 0; // clear decoding error status pic->frame = frame; task_send_picture_to_output_list(h, pic); } /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE int check_slice_header(davs2_t *h, davs2_bs_t *bs, int lcu_y) { aec_t *p_aec = &h->aec; if (h->b_slice_checked && found_slice_header(bs)) { /* slice starts at next byte */ bs->i_bit_pos = (((bs->i_bit_pos + 7) >> 3) << 3); h->i_slice_index++; parse_slice_header(h, bs); aec_init_contexts(p_aec); aec_new_slice(h); aec_start_decoding(p_aec, bs->p_stream, ((bs->i_bit_pos + 7) / 8), bs->i_stream); AEC_RETURN_ON_ERROR(-1); /* reset intra prediction modes above the first row of the current slice * (neighbors across a slice boundary are unavailable) */ lcu_y <<= (h->i_lcu_level - MIN_PU_SIZE_IN_BIT); memset(h->p_ipredmode + (lcu_y - 1) * h->i_ipredmode - 16, DC_PRED, h->i_ipredmode * sizeof(int8_t)); } return 0; } /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE void rowrec_store_lcu_recon_samples(davs2_row_rec_t *row_rec) { #if 1 UNUSED_PARAMETER(row_rec); #else gf_davs2.plane_copy(row_rec->ctu.p_frec[0], row_rec->ctu.i_frec[0], row_rec->ctu.p_fdec[0], row_rec->ctu.i_fdec[0], row_rec->ctu.i_ctu_w, row_rec->ctu.i_ctu_h); gf_davs2.plane_copy(row_rec->ctu.p_frec[1], row_rec->ctu.i_frec[1], row_rec->ctu.p_fdec[1], row_rec->ctu.i_fdec[1], row_rec->ctu.i_ctu_w_c, row_rec->ctu.i_ctu_h_c); gf_davs2.plane_copy(row_rec->ctu.p_frec[2], row_rec->ctu.i_frec[2], row_rec->ctu.p_fdec[2], row_rec->ctu.i_fdec[2], row_rec->ctu.i_ctu_w_c, row_rec->ctu.i_ctu_h_c); #endif } /* --------------------------------------------------------------------------- * decodes one LCU row */ static int decode_one_lcu_row(davs2_t *h, davs2_bs_t *bs, int i_lcu_y) { const int height_in_lcu = h->i_height_in_lcu; const int width_in_lcu = h->i_width_in_lcu; int alf_enable = h->pic_alf_on[0] | h->pic_alf_on[1] | h->pic_alf_on[2]; int lcu_xy = i_lcu_y * width_in_lcu; int i_lcu_x; int i; davs2_row_rec_t row_rec; /* loop over all LCUs in current LCU row ------------------------ */ for (i_lcu_x = 0; i_lcu_x < width_in_lcu && h->decoding_error == 0; i_lcu_x++, lcu_xy++) { if (check_slice_header(h, bs, i_lcu_y) < 0) { return -1; } #if AVS2_TRACE avs2_trace("\n*********** 
Pic: %i (I/P) MB: %i Slice: %i Type %d **********\n", h->i_poc, h->lcu.i_scu_xy, h->i_slice_index, h->i_frame_type); #endif h->lcu.lcu_aec = row_rec.lcu_info = &h->lcu_infos[lcu_xy]; rowrec_lcu_init(h, &row_rec, i_lcu_x, i_lcu_y); decode_lcu_init(h, i_lcu_x, i_lcu_y); /* decode LCU level data before one LCU */ if (h->b_sao) { sao_read_lcu_param(h, lcu_xy, h->slice_sao_on, &h->lcu.lcu_aec->sao_param); } if (h->b_alf) { for (i = 0; i < IMG_COMPONENTS; i++) { if (h->pic_alf_on[i]) { h->lcu.lcu_aec->enable_alf[i] = (uint8_t)aec_read_alf_lcu_ctrl(&h->aec); } else { h->lcu.lcu_aec->enable_alf[i] = FALSE; } } } /* decode one lcu */ decode_lcu_parse(h, h->i_lcu_level, h->lcu.i_pix_x, h->lcu.i_pix_y); /* cache CTU top border for intra prediction */ if (i_lcu_x == 0) { memcpy(row_rec.ctu_border[0].rec_top + 1, h->intra_border[0], row_rec.ctu.i_ctu_w * 2 * sizeof(pel_t)); memcpy(row_rec.ctu_border[1].rec_top + 1, h->intra_border[1], row_rec.ctu.i_ctu_w * sizeof(pel_t)); memcpy(row_rec.ctu_border[2].rec_top + 1, h->intra_border[2], row_rec.ctu.i_ctu_w * sizeof(pel_t)); } decode_lcu_recon(h, &row_rec, h->i_lcu_level, h->lcu.i_pix_x, h->lcu.i_pix_y); rowrec_store_lcu_recon_samples(&row_rec); /* cache top and left samples for intra prediction of next CTU */ davs2_cache_lcu_border(row_rec.ctu_border[0].rec_top, h->intra_border[0] + row_rec.ctu.i_pix_x + row_rec.ctu.i_ctu_w - 1, row_rec.ctu.p_frec[0] + row_rec.ctu.i_ctu_w - 1, row_rec.ctu.i_frec[0], row_rec.ctu.i_ctu_w, row_rec.ctu.i_ctu_h); davs2_cache_lcu_border_uv(row_rec.ctu_border[1].rec_top, h->intra_border[1] + row_rec.ctu.i_pix_x_c + row_rec.ctu.i_ctu_w_c - 1, row_rec.ctu.p_frec[1] + row_rec.ctu.i_ctu_w_c - 1, row_rec.ctu_border[2].rec_top, h->intra_border[2] + row_rec.ctu.i_pix_x_c + row_rec.ctu.i_ctu_w_c - 1, row_rec.ctu.p_frec[2] + row_rec.ctu.i_ctu_w_c - 1, row_rec.ctu.i_frec[1], row_rec.ctu.i_ctu_w_c, row_rec.ctu.i_ctu_h_c); /* backup bottom row pixels */ if (i_lcu_y < h->i_height_in_lcu - 1) { memcpy(h->intra_border[0] + row_rec.ctu.i_pix_x , row_rec.ctu.p_frec[0] + (row_rec.ctu.i_ctu_h - 1) * h->fdec->i_stride[0], row_rec.ctu.i_ctu_w * sizeof(pel_t)); memcpy(h->intra_border[1] + row_rec.ctu.i_pix_x_c, row_rec.ctu.p_frec[1] + (row_rec.ctu.i_ctu_h_c - 1) * h->fdec->i_stride[1], row_rec.ctu.i_ctu_w_c * sizeof(pel_t)); memcpy(h->intra_border[2] + row_rec.ctu.i_pix_x_c, row_rec.ctu.p_frec[2] + (row_rec.ctu.i_ctu_h_c - 1) * h->fdec->i_stride[1], row_rec.ctu.i_ctu_w_c * sizeof(pel_t)); } /* decode LCU level data after one LCU * update the bit position */ h->b_slice_checked = (bool_t)aec_startcode_follows(&h->aec, 1); bs->i_bit_pos = aec_bits_read(&h->aec); /* deblock one lcu */ if (h->b_loop_filter) { davs2_lcu_deblock(h, h->fdec, i_lcu_x, i_lcu_y); } } if (h->decoding_error != 0) { } else { /* SAO current lcu-row */ if (h->b_sao) { sao_lcurow(h, h->p_frame_sao, h->fdec, i_lcu_y); } /* ALF current lcu-row */ if (alf_enable) { alf_lcurow(h, h->p_alf->img_param, h->p_frame_alf, h->fdec, i_lcu_y); } } /* save motion vectors for reference frame */ if (h->rps.refered_by_others && h->i_frame_type != AVS2_I_SLICE) { save_mv_ref_info(h, i_lcu_y); } /* frame padding : line by line */ if (h->rps.refered_by_others) { pad_line_lcu(h, i_lcu_y); /* wake up all waiting threads */ decoder_signal(h, h->fdec, i_lcu_y); } if (i_lcu_y == height_in_lcu - 1) { /* init for AVS-S */ if ((h->i_frame_type == AVS2_P_SLICE || h->i_frame_type == AVS2_F_SLICE) && h->b_bkgnd_picture && h->b_bkgnd_reference) { const int w_in_spu = h->i_width_in_spu; const int h_in_spu = 
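/* NOTE: for AVS2-S (surveillance) streams, the last entry in the
 * reference list of a P/F picture is the long-term background (G/GB)
 * frame; the scan below rewrites any 4x4 block that referenced it to
 * INVALID_REF in the motion-info store, presumably so that temporal MV
 * prediction in later pictures does not scale motion against the
 * background frame (this matches the "init for AVS-S" comment above). */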
h->i_height_in_spu; int x, y; for (y = 0; y < h_in_spu; y++) { for (x = 0; x < w_in_spu; x++) { int refframe = h->p_ref_idx[y * w_in_spu + x].r[0]; if (refframe == h->num_of_references - 1) { h->p_ref_idx[y * w_in_spu + x].r[0] = INVALID_REF; } } } } task_output_decoding_frame(h); task_release_frames(h); /* task is free */ task_unload_packet(h, h->task_info.curr_es_unit); // davs2_thread_mutex_lock(&h->task_info.taskmgr->mutex_aec); // h->task_info.taskmgr->num_active_decoders--; // davs2_thread_mutex_unlock(&h->task_info.taskmgr->mutex_aec); } return 0; } // #if CTRL_AEC_THREAD /* --------------------------------------------------------------------------- * decodes one LCU row */ static int decode_one_lcu_row_parse(davs2_t *h, davs2_bs_t *bs, int i_lcu_y) { const int width_in_lcu = h->i_width_in_lcu; int lcu_xy = i_lcu_y * width_in_lcu; int i_lcu_x; int i; /* loop over all LCUs in current LCU row ------------------------ */ for (i_lcu_x = 0; i_lcu_x < width_in_lcu; i_lcu_x++, lcu_xy++) { if (check_slice_header(h, bs, i_lcu_y) < 0) { return -1; } #if AVS2_TRACE avs2_trace("\n*********** Pic: %i (I/P) MB: %i Slice: %i Type %d **********\n", h->i_poc, h->lcu.i_scu_xy, h->i_slice_index, h->i_frame_type); #endif h->lcu.lcu_aec = &h->lcu_infos[lcu_xy]; decode_lcu_init(h, i_lcu_x, i_lcu_y); /* decode LCU level data before one LCU */ if (h->b_sao) { sao_read_lcu_param(h, lcu_xy, h->slice_sao_on, &h->lcu.lcu_aec->sao_param); } if (h->b_alf) { for (i = 0; i < IMG_COMPONENTS; i++) { if (h->pic_alf_on[i]) { h->lcu.lcu_aec->enable_alf[i] = (uint8_t)aec_read_alf_lcu_ctrl(&h->aec); } else { h->lcu.lcu_aec->enable_alf[i] = FALSE; } } } /* decode one lcu */ decode_lcu_parse(h, h->i_lcu_level, h->lcu.i_pix_x, h->lcu.i_pix_y); /* decode LCU level data after one LCU * update the bit position */ h->b_slice_checked = (bool_t)aec_startcode_follows(&h->aec, 1); bs->i_bit_pos = aec_bits_read(&h->aec); h->fdec->i_parsed_lcu_xy = lcu_xy; davs2_thread_cond_broadcast(&h->fdec->cond_aec); } /* save motion vectors for reference frame */ if (h->rps.refered_by_others && h->i_frame_type != AVS2_I_SLICE) { save_mv_ref_info(h, i_lcu_y); } return 0; } /* --------------------------------------------------------------------------- * decodes one LCU row */ static int decode_lcu_row_recon(davs2_t *h, int i_lcu_y) { const int width_in_lcu = h->i_width_in_lcu; const int height_in_lcu = h->i_height_in_lcu; int alf_enable = h->pic_alf_on[0] | h->pic_alf_on[1] | h->pic_alf_on[2]; int i_lcu_level = h->i_lcu_level; int lcu_xy = i_lcu_y * h->i_width_in_lcu; int b_recon_finish = 0; int b_next_row_launched = 0; davs2_row_rec_t row_rec; while (i_lcu_y < height_in_lcu) { /* loop over all LCUs in current LCU row ------------------------ */ int i_lcu_x; for (i_lcu_x = 0; i_lcu_x < width_in_lcu; i_lcu_x++, lcu_xy++) { /* wait until the parsing process of current LCU having finished */ wait_lcu_row_parsed(h, h->fdec, lcu_xy); if (i_lcu_y > 0) { wait_lcu_row_reconed(h, h->fdec, i_lcu_y - 1, DAVS2_MIN(i_lcu_x + 2, h->i_width_in_lcu)); } row_rec.lcu_info = &h->lcu_infos[lcu_xy]; #if CTRL_AEC_THREAD row_rec.p_rec_info = &row_rec.lcu_info->rec_info; #endif rowrec_lcu_init(h, &row_rec, i_lcu_x, i_lcu_y); /* cache CTU top border for intra prediction */ if (i_lcu_x == 0) { memcpy(row_rec.ctu_border[0].rec_top + 1, h->intra_border[0], row_rec.ctu.i_ctu_w * 2 * sizeof(pel_t)); memcpy(row_rec.ctu_border[1].rec_top + 1, h->intra_border[1], row_rec.ctu.i_ctu_w * sizeof(pel_t)); memcpy(row_rec.ctu_border[2].rec_top + 1, h->intra_border[2], 
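/* NOTE: intra prediction should read neighboring samples before they are
 * overwritten by in-place loop filtering (deblock/SAO/ALF) of the frame
 * buffer, so the decoder keeps two caches: h->intra_border[] holds the
 * bottom sample row of the LCU row above (one full picture width per
 * plane), and row_rec.ctu_border[] holds the top/left neighbors of the
 * CTU currently being reconstructed. After each CTU, its right column is
 * copied into ctu_border for the next CTU in the row, and its bottom row
 * into intra_border for the next LCU row -- exactly the memcpy/cache
 * calls surrounding this point. */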
row_rec.ctu.i_ctu_w * sizeof(pel_t)); } decode_lcu_recon(h, &row_rec, i_lcu_level, i_lcu_x << i_lcu_level, i_lcu_y << i_lcu_level); rowrec_store_lcu_recon_samples(&row_rec); /* cache top and left samples for intra prediction of next CTU */ davs2_cache_lcu_border(row_rec.ctu_border[0].rec_top, h->intra_border[0] + row_rec.ctu.i_pix_x + row_rec.ctu.i_ctu_w - 1, row_rec.ctu.p_frec[0] + row_rec.ctu.i_ctu_w - 1, row_rec.ctu.i_frec[0], row_rec.ctu.i_ctu_w, row_rec.ctu.i_ctu_h); davs2_cache_lcu_border_uv(row_rec.ctu_border[1].rec_top, h->intra_border[1] + row_rec.ctu.i_pix_x_c + row_rec.ctu.i_ctu_w_c - 1, row_rec.ctu.p_frec[1] + row_rec.ctu.i_ctu_w_c - 1, row_rec.ctu_border[2].rec_top, h->intra_border[2] + row_rec.ctu.i_pix_x_c + row_rec.ctu.i_ctu_w_c - 1, row_rec.ctu.p_frec[2] + row_rec.ctu.i_ctu_w_c - 1, row_rec.ctu.i_frec[1], row_rec.ctu.i_ctu_w_c, row_rec.ctu.i_ctu_h_c); /* backup bottom row pixels */ if (i_lcu_y < h->i_height_in_lcu - 1) { memcpy(h->intra_border[0] + row_rec.ctu.i_pix_x, row_rec.ctu.p_frec[0] + (row_rec.ctu.i_ctu_h - 1) * h->fdec->i_stride[0], row_rec.ctu.i_ctu_w * sizeof(pel_t)); memcpy(h->intra_border[1] + row_rec.ctu.i_pix_x_c, row_rec.ctu.p_frec[1] + (row_rec.ctu.i_ctu_h_c - 1) * h->fdec->i_stride[1], row_rec.ctu.i_ctu_w_c * sizeof(pel_t)); memcpy(h->intra_border[2] + row_rec.ctu.i_pix_x_c, row_rec.ctu.p_frec[2] + (row_rec.ctu.i_ctu_h_c - 1) * h->fdec->i_stride[1], row_rec.ctu.i_ctu_w_c * sizeof(pel_t)); } /* deblock one lcu */ if (h->b_loop_filter) { davs2_lcu_deblock(h, h->fdec, i_lcu_x, i_lcu_y); } h->fdec->num_decoded_lcu_in_row[i_lcu_y]++; } /* SAO above lcu-row */ if (h->b_sao && i_lcu_y) { sao_lcurow(h, h->p_frame_sao, h->fdec, i_lcu_y - 1); // above row if (i_lcu_y == height_in_lcu - 1) { sao_lcurow(h, h->p_frame_sao, h->fdec, i_lcu_y); // last row } } /* ALF above lcu-row */ if (alf_enable && i_lcu_y) { alf_lcurow(h, h->p_alf->img_param, h->p_frame_alf, h->fdec, i_lcu_y - 1); // above row if (i_lcu_y == height_in_lcu - 1) { alf_lcurow(h, h->p_alf->img_param, h->p_frame_alf, h->fdec, i_lcu_y); // last row } } if (i_lcu_y > 0) { /* frame padding : line by line */ if (h->rps.refered_by_others) { pad_line_lcu(h, i_lcu_y - 1); } /* wake up all waiting threads */ decoder_signal(h, h->fdec, i_lcu_y - 1); } /* The last row in one frame */ if (i_lcu_y == height_in_lcu - 1) { b_recon_finish = 1; } /* TODO: loop to next LCU row */ if (b_next_row_launched) { break; } i_lcu_y++; } /* the bottom LCU row in a frame */ if (b_recon_finish) { if (h->rps.refered_by_others) { pad_line_lcu(h, h->i_height_in_lcu - 1); } decoder_signal(h, h->fdec, h->i_height_in_lcu - 1); /* init for AVS-S */ if ((h->i_frame_type == AVS2_P_SLICE || h->i_frame_type == AVS2_F_SLICE) && h->b_bkgnd_picture && h->b_bkgnd_reference) { const int w_in_spu = h->i_width_in_spu; const int h_in_spu = h->i_height_in_spu; int x, y; for (y = 0; y < h_in_spu; y++) { for (x = 0; x < w_in_spu; x++) { int refframe = h->p_ref_idx[y * w_in_spu + x].r[0]; if (refframe == h->num_of_references - 1) { h->p_ref_idx[y * w_in_spu + x].r[0] = INVALID_REF; } } } } // davs2_log(h, DAVS2_LOG_INFO, "POC %3d reconstruction finished.", h->i_poc); if (h->i_frame_type == AVS2_G_SLICE) { davs2_frame_copy_planes(h->f_background_ref, h->fdec); } task_output_decoding_frame(h); task_release_frames(h); /* task is free */ task_unload_packet(h, h->task_info.curr_es_unit); } return 0; } // #endif // #if CTRL_AEC_THREAD /* --------------------------------------------------------------------------- */ static void decode_user_data(davs2_t *h, 
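/* NOTE: decode_user_data() below scans the remaining bytes of the access
 * unit for start codes of the form 00 00 01 xx. A SC_USER_DATA unit (and
 * any other non-slice unit) is skipped 4 bytes at a time; the first code
 * with xx <= SC_SLICE_CODE_MAX marks slice data, sets h->b_slice_checked,
 * and the bit position is moved to that start code so slice parsing can
 * begin. Byte layout of a start code, for reference:
 *
 *   +------+------+------+------------------------------+
 *   | 0x00 | 0x00 | 0x01 | type (slice codes are <= max)|
 *   +------+------+------+------------------------------+
 */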
davs2_bs_t *bs) { int bytes = bs->i_bit_pos >> 3; int left = bs->i_stream - bytes; uint8_t *data = bs->p_stream + bytes; while (left >= 4) { if (data[0] == 0 && data[1] == 0 && data[2] == 1) { if (data[3] == SC_USER_DATA) { /* user data */ } else if (data[3] <= SC_SLICE_CODE_MAX) { /* slice */ h->b_slice_checked = 1; break; } data += 4; left -= 4; } else { ++data; --left; } } if (left >= 4) { bs->i_bit_pos = (int)((data - bs->p_stream) << 3); } } /** * =========================================================================== * interface function defines * =========================================================================== */ /* --------------------------------------------------------------------------- */ void decoder_free_extra_buffer(davs2_t *h) { if (h->f_background_ref) { davs2_frame_destroy(h->f_background_ref); h->f_background_ref = NULL; } if (h->f_background_cur) { davs2_frame_destroy(h->f_background_cur); h->f_background_cur = NULL; } if (h->p_frame_alf) { davs2_frame_destroy(h->p_frame_alf); h->p_frame_alf = NULL; } if (h->p_frame_sao) { davs2_frame_destroy(h->p_frame_sao); h->p_frame_sao = NULL; } if (h->p_integral) { davs2_free(h->p_integral); h->p_integral = NULL; } } /* --------------------------------------------------------------------------- * alloc extra buffers for the decoder according to the image width & height */ int decoder_alloc_extra_buffer(davs2_t *h) { size_t w_in_spu = h->i_width_in_spu; size_t h_in_spu = h->i_height_in_spu; size_t w_in_scu = h->i_width_in_scu; size_t h_in_scu = h->i_height_in_scu; size_t size_in_spu = w_in_spu * h_in_spu; size_t size_in_lcu = ((h->i_width + h->i_lcu_size_sub1) >> h->i_lcu_level) * ((h->i_height + h->i_lcu_size_sub1) >> h->i_lcu_level); size_t size_alf = alf_get_buffer_size(h); size_t size_extra_frame = 0; size_t mem_size; uint8_t *mem_base; assert((h->i_width & 7) == 0); assert((h->i_height & 7) == 0); size_extra_frame = 2 * davs2_frame_get_size(h->i_width, h->i_height, h->i_chroma_format, 1); size_extra_frame += (h->b_alf + h->b_sao) * davs2_frame_get_size(h->i_width, h->i_height, h->i_chroma_format, 0); mem_size = sizeof(int8_t) * (w_in_spu + 16) * (h_in_spu + 1) + /* M1, size of intra prediction mode buffer */ sizeof(int8_t) * size_in_spu + /* M3, size of prediction direction buffer */ sizeof(ref_idx_t) * size_in_spu + /* M3, size of reference index (1st+2nd) buffer */ sizeof(mv_t) * size_in_spu + /* M5, size of motion vector of 4x4 block (1st reference) buffer */ sizeof(mv_t) * size_in_spu + /* M6, size of motion vector of 4x4 block (2nd reference) buffer */ sizeof(uint8_t) * w_in_scu * h_in_scu * 2 + /* M7, size of loop filter flag buffer */ sizeof(lcu_info_t) * size_in_lcu + /* M8, size of SAO block parameter buffer */ sizeof(cu_t) * h->i_size_in_scu + /* M10, size of cu_t */ sizeof(pel_t) * h->i_width * 3 + /* M13, size of last LCU row bottom border */ size_alf + /* M11, size of ALF */ size_extra_frame + /* M12, size of extra frame */ CACHE_LINE_SIZE * 20; /* allocate memory for a decoder */ CHECKED_MALLOC(mem_base, uint8_t *, mem_size); h->p_integral = mem_base; /* pointer which holds the extra buffer */ /* M1, intra prediction mode buffer */ h->p_ipredmode = (int8_t *)mem_base; mem_base += sizeof(int8_t) * (w_in_spu + 16) * (h_in_spu + 1); h->p_ipredmode += (w_in_spu + 16) + 16; h->i_ipredmode = (w_in_spu + 16); ALIGN_POINTER(mem_base); /* M3, prediction direction buffer */ h->p_dirpred = (int8_t *)mem_base; mem_base += sizeof(int8_t) * size_in_spu; ALIGN_POINTER(mem_base); /* M3, reference index (1st) 
buffer */ h->p_ref_idx = (ref_idx_t *)mem_base; mem_base += sizeof(ref_idx_t) * size_in_spu; ALIGN_POINTER(mem_base); /* M5, motion vector of 4x4 block (1st reference) buffer */ h->p_tmv_1st = (mv_t *)mem_base; mem_base += sizeof(mv_t) * size_in_spu; ALIGN_POINTER(mem_base); /* M6, motion vector of 4x4 block (1st reference) buffer */ h->p_tmv_2nd = (mv_t *)mem_base; mem_base += sizeof(mv_t) * size_in_spu; ALIGN_POINTER(mem_base); /* M7, loop filter flag buffer */ h->p_deblock_flag[0] = (uint8_t *)mem_base; mem_base += sizeof(uint8_t) * w_in_scu * h_in_scu; h->p_deblock_flag[1] = (uint8_t *)mem_base; mem_base += sizeof(uint8_t) * w_in_scu * h_in_scu; ALIGN_POINTER(mem_base); /* M8, LCU level parameter buffer */ h->lcu_infos = (lcu_info_t *)mem_base; mem_base += sizeof(lcu_info_t) * size_in_lcu; ALIGN_POINTER(mem_base); /* allocate memory for scu_data */ h->scu_data = (cu_t *)mem_base; mem_base += h->i_size_in_scu * sizeof(cu_t); ALIGN_POINTER(mem_base); /* LCU bottom border */ h->intra_border[0] = (pel_t *)mem_base; mem_base += h->i_width * sizeof(pel_t); ALIGN_POINTER(mem_base); h->intra_border[1] = (pel_t *)mem_base; mem_base += h->i_width * sizeof(pel_t); ALIGN_POINTER(mem_base); h->intra_border[2] = (pel_t *)mem_base; mem_base += h->i_width * sizeof(pel_t); ALIGN_POINTER(mem_base); /* ALF */ h->p_alf = (alf_var_t *)mem_base; mem_base += size_alf; ALIGN_POINTER(mem_base); alf_init_buffer(h); /* ------------------------------------------------------------- * allocate frame buffers */ // AVS-S h->f_background_ref = davs2_frame_new(h->i_width, h->i_height, h->i_chroma_format, &mem_base, 1); ALIGN_POINTER(mem_base); h->f_background_cur = davs2_frame_new(h->i_width, h->i_height, h->i_chroma_format, &mem_base, 1); ALIGN_POINTER(mem_base); // ALF if (h->b_alf) { h->p_frame_alf = davs2_frame_new(h->i_width, h->i_height, h->i_chroma_format, &mem_base, 0); ALIGN_POINTER(mem_base); } // SAO if (h->b_sao) { h->p_frame_sao = davs2_frame_new(h->i_width, h->i_height, h->i_chroma_format, &mem_base, 0); ALIGN_POINTER(mem_base); } if (mem_size < (mem_base - h->p_integral)) { davs2_log(h, DAVS2_LOG_ERROR, "No enough memory allocated. mem_size %llu <= %llu\n", mem_size, mem_base - h->p_integral); goto fail; } return 0; fail: decoder_free_extra_buffer(h); return -1; } /* --------------------------------------------------------------------------- * write a frame to output picture */ void davs2_write_a_frame(davs2_picture_t *pic, davs2_frame_t *frame) { int img_width = pic->widths[0]; int img_height = pic->lines[0]; int img_width_c = (img_width / 2); int img_height_c = (img_height / (frame->i_chroma_format == CHROMA_420 ? 2 : 1)); int num_bytes_per_sample = (frame->i_output_bit_depth == 8 ? 1 : 2); int shift1 = frame->i_sample_bit_depth - frame->i_output_bit_depth; // assuming "sample_bit_depth" is greater or equal to "output_bit_depth" pel_t *p_src; uint8_t *p_dst; int k, j, i_src, i_dst; pic->num_planes = (frame->i_chroma_format != CHROMA_400) ? 
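/* NOTE: the output conversion below has three paths: (1) zero-copy when
 * the internal sample type and bit depth already match the requested
 * output, (2) 8-bit internal -> 8-bit output via a plain per-sample copy,
 * and (3) 10-bit internal -> 8-bit output with rounding:
 *
 *   out = DAVS2_CLIP1((in + (1 << (shift1 - 1))) >> shift1)
 *
 * e.g. with shift1 = 2: in = 514 -> (514 + 2) >> 2 = 129, and
 * in = 1023 -> 256, clipped to 255. */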
3 : 1; pic->bytes_per_sample = num_bytes_per_sample; pic->bit_depth = frame->i_output_bit_depth; pic->b_decode_error = frame->frm_decode_error; pic->dec_frame = NULL; pic->strides[0] = pic->widths[0] * num_bytes_per_sample; pic->strides[1] = pic->widths[1] * num_bytes_per_sample; pic->strides[2] = pic->widths[2] * num_bytes_per_sample; if (!shift1 && sizeof(pel_t) == num_bytes_per_sample) { pic->dec_frame = frame; // NOTE: zero-copy path -- the output planes below alias the decoded frame buffers pic->planes[0] = frame->planes[0]; pic->planes[1] = frame->planes[1]; pic->planes[2] = frame->planes[2]; pic->strides[0] = frame->i_stride[0] * num_bytes_per_sample; pic->strides[1] = frame->i_stride[1] * num_bytes_per_sample; pic->strides[2] = frame->i_stride[2] * num_bytes_per_sample; } else if (!shift1 && frame->i_output_bit_depth == 8) { // 8bit encode -> 8bit output p_dst = pic->planes[0]; i_dst = pic->strides[0]; p_src = frame->planes[0]; i_src = frame->i_stride[0]; for (j = 0; j < img_height; j++) { for (k = 0; k < img_width; k++) { p_dst[k] = (uint8_t)p_src[k]; } p_src += i_src; p_dst += i_dst; } if (pic->num_planes == 3) { p_dst = pic->planes[1]; i_dst = pic->strides[1]; p_src = frame->planes[1]; i_src = frame->i_stride[1]; for (j = 0; j < img_height_c; j++) { for (k = 0; k < img_width_c; k++) { p_dst[k] = (uint8_t)p_src[k]; } p_src += i_src; p_dst += i_dst; } p_dst = pic->planes[2]; i_dst = pic->strides[2]; p_src = frame->planes[2]; i_src = frame->i_stride[2]; for (j = 0; j < img_height_c; j++) { for (k = 0; k < img_width_c; k++) { p_dst[k] = (uint8_t)p_src[k]; } p_src += i_src; p_dst += i_dst; } } } else if (shift1 && frame->i_output_bit_depth == 8) { // 10bit encode -> 8bit output p_dst = pic->planes[0]; i_dst = pic->strides[0]; p_src = frame->planes[0]; i_src = frame->i_stride[0]; for (j = 0; j < img_height; j++) { for (k = 0; k < img_width; k++) { p_dst[k] = (uint8_t)DAVS2_CLIP1((p_src[k] + (1 << (shift1 - 1))) >> shift1); } p_src += i_src; p_dst += i_dst; } if (pic->num_planes == 3) { p_dst = pic->planes[1]; i_dst = pic->strides[1]; p_src = frame->planes[1]; i_src = frame->i_stride[1]; for (j = 0; j < img_height_c; j++) { for (k = 0; k < img_width_c; k++) { p_dst[k] = (uint8_t)DAVS2_CLIP1((p_src[k] + (1 << (shift1 - 1))) >> shift1); } p_src += i_src; p_dst += i_dst; } p_dst = pic->planes[2]; i_dst = pic->strides[2]; p_src = frame->planes[2]; i_src = frame->i_stride[2]; for (j = 0; j < img_height_c; j++) { for (k = 0; k < img_width_c; k++) { p_dst[k] = (uint8_t)DAVS2_CLIP1((p_src[k] + (1 << (shift1 - 1))) >> shift1); } p_src += i_src; p_dst += i_dst; } } } pic->type = frame->i_type; pic->qp = frame->i_qp; pic->pts = frame->i_pts; pic->dts = frame->i_dts; pic->pic_order_count = frame->i_poc; } /* --------------------------------------------------------------------------- */ davs2_t *decoder_open(davs2_mgr_t *mgr, davs2_t *h, int idx_decoder) { /* allocate memory for a decoder */ memset(h, 0, sizeof(davs2_t)); /* init log module */ h->module_log.i_log_level = mgr->param.info_level; sprintf(h->module_log.module_name, "Dec[%2d] %06llx", idx_decoder, (unsigned long long)(uintptr_t)h); /* cast for portability of %llx */ /* only initialize some variables, not ready to work */ h->task_info.taskmgr = mgr; h->i_width = -1; h->i_height = -1; h->i_frame_type = AVS2_I_SLICE; h->num_of_references = 0; h->b_video_edit_code = 0; #if AVS2_TRACE if (avs2_trace_init(h, TRACEFILE) == -1) { // append new statistic at the end davs2_log(h, DAVS2_LOG_ERROR, "Error opening trace file!"); } #endif return h; } /** * --------------------------------------------------------------------------- * Function : decode one frame * Parameters : * 
[in] : h - pointer to struct davs2_t (decoder handler) * : es_unit - pointer to bit-stream buffer (including the following parameters) * : data - pointer to bitstream buffer * : len - data length in bitstream buffer * : pts - user pts * : dts - user dts * Return : none * --------------------------------------------------------------------------- */ void *decoder_decode_picture_data(void *arg1, int arg2) { davs2_t *h = (davs2_t *)arg1; davs2_bs_t *bs = h->p_bs; UNUSED_PARAMETER(arg2); /* decode one frame */ init_frame(h); /* user data and slice header */ decode_user_data(h, bs); /* decode picture data */ if (h->b_slice_checked != 0) { davs2_frame_t *frame = h->fref[0]; davs2_mgr_t *mgr = h->task_info.taskmgr; const int height_in_lcu = h->i_height_in_lcu; int lcu_y; /* reset LCU decoding status */ memset(h->fdec->num_decoded_lcu_in_row, 0, sizeof(int) * h->i_height_in_lcu); // davs2_thread_mutex_lock(&mgr->mutex_aec); // mgr->num_active_decoders++; // davs2_thread_mutex_unlock(&mgr->mutex_aec); if (mgr->num_rec_thread && davs2_threadpool_is_free((davs2_threadpool_t *)mgr->thread_pool)) { /* make sure all its dependency frames have started reconstruction */ int i; for (i = 0; i < h->num_of_references; i++) { davs2_frame_t *frm = h->fref[i]; decoder_wait_lcu_row(h, frm, 0); } /* run reconstruction thread */ davs2_threadpool_run((davs2_threadpool_t *)mgr->thread_pool, (davs2_threadpool_func_t)decode_lcu_row_recon, h, 0, 0); /* ------------------------------------------------------------- * parse all LCU rows */ for (lcu_y = 0; lcu_y < height_in_lcu; lcu_y++) { /* TODO: remove the dependency in this thread */ if (frame != NULL) { decoder_wait_lcu_row(h, frame, lcu_y); } /* parsing the LCU data */ decode_one_lcu_row_parse(h, bs, lcu_y); } } else { /* ------------------------------------------------------------- * decode all LCU rows */ for (lcu_y = 0; lcu_y < height_in_lcu; lcu_y++) { if (frame != NULL) { decoder_wait_lcu_row(h, frame, lcu_y); } /* decode one lcu row */ decode_one_lcu_row(h, bs, lcu_y); } } } else { ///!!! make sure that all row signals of frames with 'b_refered_by_others == 1' have been set before return. /// use 'goto fail' instead of 'return' in the half way. if (h->rps.refered_by_others) { // set all row signals before returning. int lcu_y; for (lcu_y = 0; lcu_y < h->i_height_in_lcu; ++lcu_y) { decoder_signal(h, h->fdec, lcu_y); } } if (h->i_frame_type == AVS2_G_SLICE) { davs2_frame_copy_planes(h->f_background_ref, h->fdec); } /* task is free */ task_unload_packet(h, h->task_info.curr_es_unit); } return NULL; } /** * --------------------------------------------------------------------------- * Function : close the AVS2 decoder * Parameters : * [in] : h - pointer to struct davs2_t, the decoder handle * Return : none * --------------------------------------------------------------------------- */ void decoder_close(davs2_t *h) { /* free extra buffer */ decoder_free_extra_buffer(h); #if AVS2_TRACE /* destroy the trace */ avs2_trace_destroy(); #endif } davs2-1.6/source/common/decoder.h000066400000000000000000000053601337322544400167600ustar00rootroot00000000000000/* * decoder.h * * Description of this file: * Decoder functions definition of the davs2 library * * -------------------------------------------------------------------------- * * davs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. 
* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ #ifndef DAVS2_DECODER_H #define DAVS2_DECODER_H #ifdef __cplusplus extern "C" { #endif #include "common.h" #define decoder_open FPFX(decoder_decoder_open) davs2_t *decoder_open(davs2_mgr_t *mgr, davs2_t *h, int idx_decoder); #define decoder_decode_picture_data FPFX(decoder_decode_picture_data) void *decoder_decode_picture_data(void *arg1, int arg2); #define decoder_close FPFX(decoder_decoder_close) void decoder_close(davs2_t *h); #define create_freepictures FPFX(create_freepictures) int create_freepictures(davs2_mgr_t *mgr, int w, int h, int size); #define destroy_freepictures FPFX(destroy_freepictures) void destroy_freepictures(davs2_mgr_t *mgr); #define decoder_alloc_extra_buffer FPFX(decoder_alloc_extra_buffer) int decoder_alloc_extra_buffer(davs2_t *h); #define decoder_free_extra_buffer FPFX(decoder_free_extra_buffer) void decoder_free_extra_buffer(davs2_t *h); #define davs2_write_a_frame FPFX(write_a_frame) void davs2_write_a_frame(davs2_picture_t *pic, davs2_frame_t *frame); #define task_get_references FPFX(task_get_references) int task_get_references(davs2_t *h, int64_t pts, int64_t dts); #define task_unload_packet FPFX(task_unload_packet) void task_unload_packet(davs2_t *h, es_unit_t *es_unit); #define decoder_get_output FPFX(decoder_get_output) int decoder_get_output(davs2_mgr_t *mgr, davs2_seq_info_t *headerset, davs2_picture_t *out_frame, int is_flush); #ifdef __cplusplus } #endif #endif // DAVS2_DECODER_H davs2-1.6/source/common/defines.h000066400000000000000000000317771337322544400170030ustar00rootroot00000000000000/* * defines.h * * Description of this file: * const variable definition of the davs2 library * * -------------------------------------------------------------------------- * * davs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. 
*/ #ifndef DAVS2_DEFINES_H #define DAVS2_DEFINES_H /** * =========================================================================== * build switch * =========================================================================== */ /* --------------------------------------------------------------------------- * build */ #define RELEASE_BUILD 1 /* 1: release build */ #define CTRL_AEC_THREAD 0 /* AEC and reconstruct conducted in different threads */ #define CTRL_AEC_CONVERSION 0 /* AEC result conversion */ /* --------------------------------------------------------------------------- * debug */ #if RELEASE_BUILD #define AVS2_TRACE 0 /* write trace file, 1: ON, 0: OFF */ #else #define AVS2_TRACE 0 /* write trace file, 1: ON, 0: OFF */ #endif #define DAVS2_TRACE_API 0 /* API calling trace */ #define USE_NEW_INTPL 0 /* use new interpolation functions */ #define BUGFIX_PREDICTION_INTRA 1 /* align to latest intra prediction */ /** * =========================================================================== * define of const variables * =========================================================================== */ /* --------------------------------------------------------------------------- * profile */ #define MAIN_PICTURE_PROFILE 0x12 #define MAIN_PROFILE 0x20 #define MAIN10_PROFILE 0x22 enum chroma_format_e { CHROMA_400 = 0, CHROMA_420 = 1, CHROMA_422 = 2, CHROMA_444 = 3 }; /* --------------------------------------------------------------------------- * prediction techniques */ #define DMH_MODE_NUM 5 /* number of DMH mode */ /* --------------------------------------------------------------------------- * SAO */ #define MAX_NUM_SAO_CLASSES 32 #define NUM_SAO_BO_CLASSES_LOG2 5 #define NUM_SAO_BO_CLASSES_IN_BIT 5 #define NUM_SAO_EO_TYPES_LOG2 2 #define SAO_SHIFT_PIX_NUM 4 /* --------------------------------------------------------------------------- * ALF parameters */ #define ALF_NUM_VARS 16 #define ALF_MAX_NUM_COEF 9 #define LOG2_VAR_SIZE_H 2 #define LOG2_VAR_SIZE_W 2 #define ALF_FOOTPRINT_SIZE 7 #define DF_CHANGED_SIZE 3 #define ALF_NUM_BIT_SHIFT 6 /* --------------------------------------------------------------------------- * Quantization parameter range */ #define MIN_QP 0 #if HIGH_BIT_DEPTH #define MAX_QP 79 /* max QP */ #else #define MAX_QP 63 /* max QP */ #endif #define SHIFT_QP 11 /* --------------------------------------------------------------------------- * block sizes */ #define MAX_CU_SIZE 64 /* 64x64 */ #define MAX_CU_SIZE_IN_BIT 6 #define MIN_CU_SIZE 8 /* 8x8 */ #define MIN_CU_SIZE_IN_BIT 3 #define MIN_PU_SIZE 4 /* 4x4 */ #define MIN_PU_SIZE_IN_BIT 2 #define BLOCK_MULTIPLE (MIN_CU_SIZE/MIN_PU_SIZE) #define B4X4_IN_BIT 2 #define B8X8_IN_BIT 3 #define B16X16_IN_BIT 4 #define B32X32_IN_BIT 5 #define B64X64_IN_BIT 6 /* --------------------------------------------------------------------------- * luma intra prediction modes */ enum intra_pred_mode_e { /* non-angular mode */ DC_PRED = 0 , /* prediction mode: DC */ PLANE_PRED = 1 , /* prediction mode: PLANE */ BI_PRED = 2 , /* prediction mode: BI */ /* vertical angular mode */ INTRA_ANG_X_3 = 3, INTRA_ANG_X_4 = 4, INTRA_ANG_X_5 = 5, INTRA_ANG_X_6 = 6, INTRA_ANG_X_7 = 7, INTRA_ANG_X_8 = 8, INTRA_ANG_X_9 = 9, INTRA_ANG_X_10 = 10, INTRA_ANG_X_11 = 11, INTRA_ANG_X_12 = 12, VERT_PRED = INTRA_ANG_X_12, /* prediction mode: VERT */ /* vertical + horizontal angular mode */ INTRA_ANG_XY_13 = 13, INTRA_ANG_XY_14 = 14, INTRA_ANG_XY_15 = 15, INTRA_ANG_XY_16 = 16, INTRA_ANG_XY_17 = 17, INTRA_ANG_XY_18 = 18, INTRA_ANG_XY_19 = 19, INTRA_ANG_XY_20 = 20, 
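/* NOTE: AVS2 defines NUM_INTRA_MODE = 33 luma intra modes: three
 * non-angular modes (DC, Plane, Bi) followed by 30 angular modes grouped
 * by projection direction -- X modes 3..12 project onto the row above
 * (mode 12 is pure vertical, VERT_PRED), XY modes 13..23 use both the row
 * above and the column to the left, and Y modes 24..32 project onto the
 * left column (mode 24 is pure horizontal, HOR_PRED). */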
INTRA_ANG_XY_21 = 21, INTRA_ANG_XY_22 = 22, INTRA_ANG_XY_23 = 23, /* horizontal angular mode */ INTRA_ANG_Y_24 = 24, INTRA_ANG_Y_25 = 25, INTRA_ANG_Y_26 = 26, INTRA_ANG_Y_27 = 27, INTRA_ANG_Y_28 = 28, INTRA_ANG_Y_29 = 29, INTRA_ANG_Y_30 = 30, INTRA_ANG_Y_31 = 31, INTRA_ANG_Y_32 = 32, HOR_PRED = INTRA_ANG_Y_24, /* prediction mode: HOR */ NUM_INTRA_MODE = 33, /* number of luma intra prediction modes */ }; /* --------------------------------------------------------------------------- * chroma intra prediction modes */ enum intra_chroma_pred_mode_e { /* chroma intra prediction modes */ DM_PRED_C = 0, /* prediction mode: DM */ DC_PRED_C = 1, /* prediction mode: DC */ HOR_PRED_C = 2, /* prediction mode: HOR */ VERT_PRED_C = 3, /* prediction mode: VERT */ BI_PRED_C = 4, /* prediction mode: BI */ NUM_INTRA_MODE_CHROMA = 5, /* number of chroma intra prediction modes */ }; /* --------------------------------------------------------------------------- * mv predicating */ #define MVPRED_xy_MIN 0 #define MVPRED_L 1 #define MVPRED_U 2 #define MVPRED_UR 3 /* --------------------------------------------------------------------------- * mv predicating direction */ #define PDIR_FWD 0 #define PDIR_BWD 1 #define PDIR_SYM 2 #define PDIR_BID 3 #define PDIR_DUAL 4 #define PDIR_INVALID -1 /* invalid predicating direction */ /* --------------------------------------------------------------------------- * unification of MV scaling */ #define MULTI 16384 #define HALF_MULTI 8192 #define OFFSET 14 /* --------------------------------------------------------------------------- * motion information storage compression */ #define MV_DECIMATION_FACTOR 4 /* store the middle pixel's mv in a motion information unit */ #define MV_FACTOR_IN_BIT 2 /* --------------------------------------------------------------------------- * for 16-BITS transform */ #define LIMIT_BIT 16 /* --------------------------------------------------------------------------- * max value */ #define AVS2_THREAD_MAX 16 /* max number of threads */ #define DAVS2_WORK_MAX 128 /* max number of works (thread queue) */ #define AVS2_MAX_REFS 4 /* max reference frame number */ #define AVS2_GOP_NUM 32 /* max GOP number */ #define AVS2_COI_CYCLE 256 /* COI ranges from [0, 255] */ #define MAX_POC_DISTANCE 128 /* max POC distance */ #define INVALID_FRAME -1 /* invalid value for COI & POC */ #define CG_SIZE 16 /* size of an coefficient group, 4x4 */ #define TEMPORAL_MAXLEVEL_BIT 3 /* bit number of temporal_id */ #define THRESHOLD_PMVR 2 /* threshold for pmvr */ #define MAX_ES_FRAME_SIZE 4000000 /* default max es frame size: 4MB */ #define MAX_ES_FRAME_NUM 64 /* default number of es frames */ #define AVS2_PAD (64 + 16) /* number of pixels padded around the reference frame */ #define DAVS2_MAX_LCU_ROWS 256 /* maximum number of LCU rows of one frame */ /* --------------------------------------------------------------------------- * aec */ #define SE_CHROMA 1 /* context for read (run, level) */ #define SE_LUMA_8x8 2 /* context for read (run, level) */ /* --------------------------------------------------------------------------- * transform */ #define SEC_TR_SIZE 4 /* block size of 2nd transform */ /* --------------------------------------------------------------------------- * CPU flags */ /* x86 */ #define DAVS2_CPU_CMOV 0x0000001 #define DAVS2_CPU_MMX 0x0000002 #define DAVS2_CPU_MMX2 0x0000004 /* MMX2 aka MMXEXT aka ISSE */ #define DAVS2_CPU_MMXEXT DAVS2_CPU_MMX2 #define DAVS2_CPU_SSE 0x0000008 #define DAVS2_CPU_SSE2 0x0000010 #define DAVS2_CPU_SSE3 0x0000020 #define 
/* ---------------------------------------------------------------------------
 * others */
#ifndef FALSE
#define FALSE 0
#endif
#ifndef TRUE
#define TRUE 1
#endif
#define FAST_GET_SPS 1 /* get the SPS as soon as possible */
/* ---------------------------------------------------------------------------
 * all assembly and related C functions are prefixed with 'davs2_' by default */
#define PFXB(prefix, name) prefix ## _ ## name
#define PFXA(prefix, name) PFXB(prefix, name)
#define FPFX(name) PFXA(davs2, name) /* e.g. FPFX(frame_new) expands to davs2_frame_new */
/* ---------------------------------------------------------------------------
 * flag */
#define AVS2_EXIT_THREAD (-1) /* flag to terminate a thread */
/* ---------------------------------------------------------------------------
 * whether HDR chroma delta QP is enabled */
#define HDR_CHROMA_DELTA_QP 0
#endif // DAVS2_DEFINES_H
davs2-1.6/source/common/frame.cc000066400000000000000000000444111337322544400166030ustar00rootroot00000000000000/*
 * frame.cc
 *
 * Description of this file:
 *    Frame handling functions definition of the davs2 library
 *
 * --------------------------------------------------------------------------
 *
 *    davs2 - video decoder of AVS2/IEEE1857.4 video coding standard
 *    Copyright (C) 2018~ VCL, NELVT, Peking University
 *
 *    Authors: Falei LUO
 *             etc.
 *
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ #include "common.h" #include "frame.h" #include "header.h" /** * =========================================================================== * border expanding * =========================================================================== */ /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE void pad_line_pixel(pel_t *pix, int width, int num_pad) { pel4_t *p_l4 = (pel4_t *)(pix - num_pad); pel4_t *p_r4 = (pel4_t *)(pix + width); pel4_t l4 = pix[0]; pel4_t r4 = pix[width - 1]; #if ARCH_X86_64 && !HIGH_BIT_DEPTH uint64_t *p_l64 = (uint64_t *)p_l4; uint64_t *p_r64 = (uint64_t *)p_r4; uint64_t l64; uint64_t r64; #endif #if HIGH_BIT_DEPTH l4 = (l4 << 48) | (l4 << 32) | (l4 << 16) | l4; r4 = (r4 << 48) | (r4 << 32) | (r4 << 16) | r4; #else l4 = (l4 << 24) | (l4 << 16) | (l4 << 8) | l4; r4 = (r4 << 24) | (r4 << 16) | (r4 << 8) | r4; #if ARCH_X86_64 l64 = ((uint64_t)(l4) << 32) | l4; r64 = ((uint64_t)(r4) << 32) | r4; #endif #endif #if ARCH_X86_64 && !HIGH_BIT_DEPTH assert((num_pad & 7) == 0); num_pad >>= 3; for (; num_pad != 0; num_pad--) { *p_l64++ = l64; /* pad left */ *p_r64++ = r64; /* pad right */ } #else assert((num_pad & 3) == 0); num_pad >>= 2; for (; num_pad != 0; num_pad--) { *p_l4++ = l4; /* pad left */ *p_r4++ = r4; /* pad right */ } #endif } /* --------------------------------------------------------------------------- */ void pad_line_lcu(davs2_t *h, int lcu_y) { davs2_frame_t *frame = h->fdec; int i, j; for (i = 0; i < 3; i++) { int chroma_shift = !!i; int start = ((lcu_y + 0) << h->i_lcu_level) >> chroma_shift; ///< -4 for ALF int end = ((lcu_y + 1) << h->i_lcu_level) >> chroma_shift; int i_stride = frame->i_stride[i]; int i_width = frame->i_width[i]; const int num_pad = AVS2_PAD >> chroma_shift; pel_t *pix; if (lcu_y > 0) { start -= 4; } if (lcu_y < h->i_height_in_lcu - 1) { end -= 4; } /* padding these rows */ for (j = start; j < end; j++) { pix = frame->planes[i] + j * i_stride; pad_line_pixel(pix, i_width, num_pad); } /* for the first row, padding the rows above the picture edges */ if (lcu_y == 0) { pix = frame->planes[i] - (num_pad); for (j = 0; j < (num_pad); j++) { gf_davs2.memcpy_aligned(pix - i_stride, pix, i_stride * sizeof(pel_t)); pix -= i_stride; } } /* for the last row, padding the rows under of the picture edges */ if (lcu_y == h->i_height_in_lcu - 1) { pix = frame->planes[i] + (frame->i_lines[i] - 1) * i_stride - (num_pad); for (j = 0; j < (num_pad); j++) { gf_davs2.memcpy_aligned(pix + i_stride, pix, i_stride * sizeof(pel_t)); pix += i_stride; } } } } /** * =========================================================================== * memory handling * =========================================================================== */ /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE int align_stride(int x, int align, int disalign) { x = DAVS2_ALIGN(x, align); if (!(x & (disalign - 1))) { x 
+= align; } return x; } /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE int align_plane_size(int x, int disalign) { if (!(x & (disalign - 1))) { x += 128; } return x; } /* --------------------------------------------------------------------------- */ size_t davs2_frame_get_size(int width, int height, int chroma_format, int b_extra) { const int width_c = width >> 1; const int height_c = height >> (chroma_format == CHROMA_420 ? 1 : 0); const int width_in_spu = width >> MIN_PU_SIZE_IN_BIT; const int height_in_spu = height >> MIN_PU_SIZE_IN_BIT; const int max_lcu_height = (height + (1 << 4) - 1) >> 4; /* frame height in 16x16 LCU */ const int align = 32; const int disalign = 1 << 16; int extra_buf_size = 0; /* extra buffer size */ int stride_l, stride_c; int size_l, size_c; /* size of luma and chroma plane */ size_t mem_size; /* total memory size */ /* need extra buffer? */ if (b_extra) { /* reference information buffer size (in SPU) */ extra_buf_size = width_in_spu * height_in_spu; } /* compute stride and the plane size * +PAD for extra data for MC */ stride_l = align_stride(width + AVS2_PAD * 2, align, disalign); stride_c = align_stride(width_c + AVS2_PAD, align, disalign); size_l = align_plane_size(stride_l * (height + AVS2_PAD * 2) + CACHE_LINE_SIZE, disalign); size_c = align_plane_size(stride_c * (height_c + AVS2_PAD) + CACHE_LINE_SIZE, disalign); /* compute space size and alloc memory */ mem_size = sizeof(davs2_frame_t) + /* M0, size of frame handle */ sizeof(pel_t) * (size_l + size_c * 2) + /* M1, size of planes buffer: Y+U+V */ sizeof(int8_t) * extra_buf_size + /* M2, size of SPU reference index buffer */ sizeof(mv_t) * extra_buf_size + /* M3, size of SPU motion vector buffer */ sizeof(davs2_thread_cond_t) * max_lcu_height + /* M4, condition variables for each LCU line */ sizeof(int) * max_lcu_height + /* M5, LCU decoding status */ CACHE_LINE_SIZE * 6; return mem_size; } /* --------------------------------------------------------------------------- */ davs2_frame_t *davs2_frame_new(int width, int height, int chroma_format, uint8_t **mem_base, int b_extra) { const int width_c = width >> 1; const int height_c = height >> (chroma_format == CHROMA_420 ? 1 : 0); const int width_in_spu = width >> MIN_PU_SIZE_IN_BIT; const int height_in_spu = height >> MIN_PU_SIZE_IN_BIT; const int max_lcu_height = (height + (1 << 4) - 1) / (1 << 4); /* frame height in 16x16 LCU */ const int align = 32; const int disalign = 1 << 16; int extra_buf_size = 0; /* extra buffer size */ int stride_l, stride_c; int size_l, size_c; /* size of luma and chroma plane */ int i, mem_size; /* total memory size */ davs2_frame_t *frame; uint8_t *mem_ptr; /* need extra buffer? 
*/ if (b_extra) { /* reference information buffer size (in SPU) */ extra_buf_size = width_in_spu * height_in_spu; } /* compute stride and the plane size * +PAD for extra data for MC */ stride_l = align_stride(width + AVS2_PAD * 2, align, disalign); stride_c = align_stride(width_c + AVS2_PAD, align, disalign); size_l = align_plane_size(stride_l * (height + AVS2_PAD * 2) + CACHE_LINE_SIZE, disalign); size_c = align_plane_size(stride_c * (height_c + AVS2_PAD) + CACHE_LINE_SIZE, disalign); /* compute space size and alloc memory */ mem_size = sizeof(davs2_frame_t) + /* M0, size of frame handle */ sizeof(pel_t) * (size_l + size_c * 2) + /* M1, size of planes buffer: Y+U+V */ sizeof(int8_t) * extra_buf_size + /* M2, size of SPU reference index buffer */ sizeof(mv_t) * extra_buf_size + /* M3, size of SPU motion vector buffer */ sizeof(davs2_thread_cond_t) * max_lcu_height + /* M4, condition variables for each LCU line */ sizeof(int) * max_lcu_height + /* M5, LCU decoding status */ CACHE_LINE_SIZE * 8; if (mem_base == NULL) { CHECKED_MALLOC(mem_ptr, uint8_t *, mem_size); } else { mem_ptr = *mem_base; } /* M0, frame handle */ frame = (davs2_frame_t *)mem_ptr; memset(frame, 0, sizeof(davs2_frame_t)); mem_ptr += sizeof(davs2_frame_t); ALIGN_POINTER(mem_ptr); /* set frame properties */ frame->i_plane = 3; /* planes: Y+U+V */ frame->i_width [0] = width; frame->i_lines [0] = height; frame->i_stride[0] = stride_l; frame->i_width [1] = frame->i_width [2] = width_c; frame->i_lines [1] = frame->i_lines [2] = height_c; frame->i_stride[1] = frame->i_stride[2] = stride_c; frame->i_type = -1; frame->i_pts = -1; frame->i_coi = INVALID_FRAME; frame->i_poc = INVALID_FRAME; frame->b_refered_by_others = 0; /* M1, buffer for planes: Y+U+V */ frame->planes[0] = (pel_t *)mem_ptr; frame->planes[1] = frame->planes[0] + size_l; frame->planes[2] = frame->planes[1] + size_c; mem_ptr += sizeof(pel_t) * (size_l + size_c * 2); /* point to plane data area */ frame->planes[0] += frame->i_stride[0] * (AVS2_PAD ) + (AVS2_PAD ); frame->planes[1] += frame->i_stride[1] * (AVS2_PAD / 2) + (AVS2_PAD / 2); frame->planes[2] += frame->i_stride[2] * (AVS2_PAD / 2) + (AVS2_PAD / 2); ALIGN_POINTER(frame->planes[0]); ALIGN_POINTER(frame->planes[1]); ALIGN_POINTER(frame->planes[2]); if (b_extra) { /* M2, reference index buffer (in SPU) */ frame->refbuf = (int8_t *)mem_ptr; mem_ptr += sizeof(int8_t) * extra_buf_size; ALIGN_POINTER(mem_ptr); /* M3, motion vector buffer (in SPU) */ frame->mvbuf = (mv_t *)mem_ptr; mem_ptr += sizeof(mv_t) * extra_buf_size; ALIGN_POINTER(mem_ptr); } /* M4 */ frame->conds_lcu_row = (davs2_thread_cond_t *)mem_ptr; mem_ptr += sizeof(davs2_thread_cond_t) * max_lcu_height; ALIGN_POINTER(mem_ptr); /* M5 */ frame->num_decoded_lcu_in_row = (int *)mem_ptr; mem_ptr += sizeof(int) * max_lcu_height; ALIGN_POINTER(mem_ptr); assert(mem_ptr - (uint8_t *)frame <= mem_size); /* update mem_base */ if (mem_base != NULL) { *mem_base = mem_ptr; frame->is_self_malloc = 0; } else { frame->is_self_malloc = 1; } frame->i_conds = max_lcu_height; frame->i_decoded_line = -1; frame->i_ref_count = 0; frame->i_disposable = 0; for (i = 0; i < frame->i_conds; i++) { if (davs2_thread_cond_init(&frame->conds_lcu_row[i], NULL)) { goto fail; } } davs2_thread_cond_init(&frame->cond_aec, NULL); davs2_thread_mutex_init(&frame->mutex_frm, NULL); davs2_thread_mutex_init(&frame->mutex_recon, NULL); return frame; fail: if (mem_ptr) { davs2_free(mem_ptr); } return NULL; } /* --------------------------------------------------------------------------- */ void 
davs2_frame_destroy(davs2_frame_t *frame)
{
    int i;
    if (frame == NULL) {
        return;
    }
    davs2_thread_mutex_destroy(&frame->mutex_frm);
    davs2_thread_mutex_destroy(&frame->mutex_recon);
    davs2_thread_cond_destroy(&frame->cond_aec); /* initialized in davs2_frame_new() */
    for (i = 0; i < frame->i_conds; i++) {
        davs2_thread_cond_destroy(&frame->conds_lcu_row[i]);
    }
    /* free the frame itself */
    if (frame->is_self_malloc) {
        davs2_free(frame);
    }
}

/* --------------------------------------------------------------------------- */
void davs2_frame_copy_planes(davs2_frame_t *p_dst, davs2_frame_t *p_src)
{
    /* copy frame properties */
    memcpy(p_dst, p_src, (uint8_t *)&p_src->i_ref_count - (uint8_t *)p_src);
    /* copy all plane data */
#if 1
    /* copy each plane as one large aligned block, which is faster than a
     * row-by-row copy; this requires identical strides, checked below */
    assert(p_src->i_stride[0] == p_dst->i_stride[0]);
    assert(p_src->i_stride[1] == p_dst->i_stride[1]);
    assert(p_src->i_stride[2] == p_dst->i_stride[2]);
    gf_davs2.memcpy_aligned(p_dst->planes[0], p_src->planes[0], p_src->i_stride[0] * p_src->i_lines[0] * sizeof(pel_t));
    gf_davs2.memcpy_aligned(p_dst->planes[1], p_src->planes[1], p_src->i_stride[1] * p_src->i_lines[1] * sizeof(pel_t));
    gf_davs2.memcpy_aligned(p_dst->planes[2], p_src->planes[2], p_src->i_stride[2] * p_src->i_lines[2] * sizeof(pel_t));
#else
    gf_davs2.plane_copy(p_dst->planes[0], p_dst->i_stride[0], p_src->planes[0], p_src->i_stride[0], p_src->i_width[0], p_src->i_lines[0]);
    gf_davs2.plane_copy(p_dst->planes[1], p_dst->i_stride[1], p_src->planes[1], p_src->i_stride[1], p_src->i_width[1], p_src->i_lines[1]);
    gf_davs2.plane_copy(p_dst->planes[2], p_dst->i_stride[2], p_src->planes[2], p_src->i_stride[2], p_src->i_width[2], p_src->i_lines[2]);
#endif
}

/* ---------------------------------------------------------------------------
 * copy frame properties */
void davs2_frame_copy_properties(davs2_frame_t *p_dst, davs2_frame_t *p_src)
{
    memcpy(p_dst, p_src, (uint8_t *)&p_src->i_ref_count - (uint8_t *)p_src);
}
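/* note (editorial): both copy routines above rely on the member order of
 * davs2_frame_t -- the memcpy() duplicates every field declared before
 * i_ref_count in one block, while i_ref_count itself and the
 * synchronization members that follow it are deliberately left untouched.
 * Reordering the members of davs2_frame_t would silently change what
 * gets copied. */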
/* --------------------------------------------------------------------------- */
void davs2_frame_copy_lcu(davs2_t *h, davs2_frame_t *p_dst, davs2_frame_t *p_src, int i_lcu_x, int i_lcu_y, int pix_offset, int padding_size)
{
    int pix_y = (i_lcu_y << h->i_lcu_level) + pix_offset;
    int pix_x = (i_lcu_x << h->i_lcu_level) + pix_offset;
    int lcu_width = DAVS2_MIN(h->i_lcu_size, h->i_width - pix_x);
    int lcu_height = DAVS2_MIN(h->i_lcu_size, h->i_height - pix_y);
    int y, len, stride;
    pel_t *src, *dst;
    /* Y */
    stride = p_src->i_stride[0];
    src = p_src->planes[0] + pix_y * stride + pix_x;
    dst = p_dst->planes[0] + pix_y * stride + pix_x;
    len = lcu_width * sizeof(pel_t);
    for (y = 0; y < lcu_height; y++) {
        gf_davs2.fast_memcpy(dst, src, len);
        if (padding_size) {
            pad_line_pixel(dst, p_dst->i_width[0], padding_size);
        }
        src += stride;
        dst += stride;
    }
    pix_y = (i_lcu_y << (h->i_lcu_level - 1)) + pix_offset;
    pix_x = (i_lcu_x << (h->i_lcu_level - 1)) + pix_offset;
    lcu_height >>= 1;
    /* U */
    stride = p_src->i_stride[1];
    src = p_src->planes[1] + pix_y * stride + pix_x;
    dst = p_dst->planes[1] + pix_y * stride + pix_x;
    len = lcu_width * sizeof(pel_t);
    for (y = 0; y < lcu_height; y++) {
        gf_davs2.fast_memcpy(dst, src, len);
        if (padding_size) {
            pad_line_pixel(dst, p_dst->i_width[1], padding_size);
        }
        src += stride;
        dst += stride;
    }
    /* V */
    stride = p_src->i_stride[2];
    src = p_src->planes[2] + pix_y * stride + pix_x;
    dst = p_dst->planes[2] + pix_y * stride + pix_x;
    len = lcu_width * sizeof(pel_t);
    for (y = 0; y < lcu_height; y++) {
        gf_davs2.fast_memcpy(dst, src, len);
        if (padding_size) {
            pad_line_pixel(dst, p_dst->i_width[2], padding_size);
        }
        src += stride;
        dst += stride;
    }
}

/* ---------------------------------------------------------------------------
 * padding_size - padding size for left and right edges */
void davs2_frame_copy_lcurow(davs2_t *h, davs2_frame_t *p_dst, davs2_frame_t *p_src, int i_lcu_y, int pix_offset, int padding_size)
{
    int pix_y = (i_lcu_y << h->i_lcu_level) + pix_offset;
    int lcu_h = DAVS2_MIN(h->i_height, ((i_lcu_y + 1) << h->i_lcu_level)) - pix_y;
    int y, len, stride;
    pel_t *src, *dst;
    /* Y */
    stride = p_src->i_stride[0];
    src = p_src->planes[0] + pix_y * stride;
    dst = p_dst->planes[0] + pix_y * stride;
    len = p_src->i_width[0] * sizeof(pel_t);
    for (y = 0; y < lcu_h; y++) {
        gf_davs2.fast_memcpy(dst, src, len);
        if (padding_size) {
            pad_line_pixel(dst, p_dst->i_width[0], padding_size);
        }
        src += stride;
        dst += stride;
    }
    pix_y = (i_lcu_y << (h->i_lcu_level - 1)) + pix_offset;
    lcu_h = DAVS2_MIN(h->i_height >> 1, ((i_lcu_y + 1) << (h->i_lcu_level - 1))) - pix_y;
    /* U */
    stride = p_src->i_stride[1];
    src = p_src->planes[1] + pix_y * stride;
    dst = p_dst->planes[1] + pix_y * stride;
    len = p_src->i_width[1] * sizeof(pel_t);
    for (y = 0; y < lcu_h; y++) {
        gf_davs2.fast_memcpy(dst, src, len);
        if (padding_size) {
            pad_line_pixel(dst, p_dst->i_width[1], padding_size);
        }
        src += stride;
        dst += stride;
    }
    /* V */
    stride = p_src->i_stride[2];
    src = p_src->planes[2] + pix_y * stride;
    dst = p_dst->planes[2] + pix_y * stride;
    len = p_src->i_width[2] * sizeof(pel_t);
    for (y = 0; y < lcu_h; y++) {
        gf_davs2.fast_memcpy(dst, src, len);
        if (padding_size) {
            pad_line_pixel(dst, p_dst->i_width[2], padding_size);
        }
        src += stride;
        dst += stride;
    }
}
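/* ---------------------------------------------------------------------------
 * usage sketch (editorial, illustrative only): a frame either owns its
 * memory (mem_base == NULL) or is carved out of one big pool, which is
 * how create_dpb() in header.cc drives this pair of functions:
 *
 *     size_t size = davs2_frame_get_size(w, h, CHROMA_420, 1);
 *     uint8_t *pool = (uint8_t *)davs2_malloc(n * (size + CACHE_LINE_SIZE));
 *     uint8_t *p = pool;          // bump pointer, advanced by each call
 *     davs2_frame_t *f = davs2_frame_new(w, h, CHROMA_420, &p, 1);
 *
 * a frame created from a pool must not be freed one by one:
 * davs2_frame_destroy(f) releases its mutexes and condition variables but
 * leaves the pool memory alone (is_self_malloc == 0). */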
davs2-1.6/source/common/frame.h000066400000000000000000000055431337322544400164500ustar00rootroot00000000000000/*
 * frame.h
 *
 * Description of this file:
 *    Frame handling function declarations of the davs2 library
 *
 * --------------------------------------------------------------------------
 *
 *    davs2 - video decoder of AVS2/IEEE1857.4 video coding standard
 *    Copyright (C) 2018~ VCL, NELVT, Peking University
 *
 *    Authors: Falei LUO
 *             etc.
 *
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 *    This program is also available under a commercial proprietary license.
 *    For more information, contact us at sswang @ pku.edu.cn.
 */
#ifndef DAVS2_FRAME_H
#define DAVS2_FRAME_H
#ifdef __cplusplus
extern "C" {
#endif

/**
 * ===========================================================================
 * function declares
 * ===========================================================================
 */
#define davs2_frame_get_size FPFX(frame_get_size)
size_t davs2_frame_get_size(int width, int height, int chroma_format, int b_extra);
#define davs2_frame_new FPFX(frame_new)
davs2_frame_t *davs2_frame_new(int width, int height, int chroma_format, uint8_t **mem_base, int b_extra);
#define davs2_frame_destroy FPFX(frame_destroy)
void davs2_frame_destroy(davs2_frame_t *frame);
#define davs2_frame_copy_planes FPFX(frame_copy_planes)
void davs2_frame_copy_planes(davs2_frame_t *p_dst, davs2_frame_t *p_src);
#define davs2_frame_copy_properties FPFX(frame_copy_properties)
void davs2_frame_copy_properties(davs2_frame_t *p_dst, davs2_frame_t *p_src);
#define davs2_frame_copy_lcu FPFX(frame_copy_lcu)
void davs2_frame_copy_lcu(davs2_t *h, davs2_frame_t *p_dst, davs2_frame_t *p_src, int i_lcu_x, int i_lcu_y, int pix_offset, int padding_size);
#define davs2_frame_copy_lcurow FPFX(frame_copy_lcurow)
void davs2_frame_copy_lcurow(davs2_t *h, davs2_frame_t *p_dst, davs2_frame_t *p_src, int i_lcu_y, int pix_offset, int padding_size);
#define davs2_frame_expand_border FPFX(frame_expand_border)
void davs2_frame_expand_border(davs2_frame_t *frame);
#define pad_line_lcu FPFX(pad_line_lcu)
void pad_line_lcu(davs2_t *h, int lcu_y);

#ifdef __cplusplus
}
#endif
#endif /* DAVS2_FRAME_H */
davs2-1.6/source/common/header.cc000066400000000000000000001504671337322544400167500ustar00rootroot00000000000000/*
 * header.cc
 *
 * Description of this file:
 *    Header functions definition of the davs2 library
 *
 * --------------------------------------------------------------------------
 *
 *    davs2 - video decoder of AVS2/IEEE1857.4 video coding standard
 *    Copyright (C) 2018~ VCL, NELVT, Peking University
 *
 *    Authors: Falei LUO
 *             etc.
 *
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 *    This program is also available under a commercial proprietary license.
 *    For more information, contact us at sswang @ pku.edu.cn.
*/ #include "common.h" #include "davs2.h" #include "transform.h" #include "vlc.h" #include "header.h" #include "aec.h" #include "alf.h" #include "quant.h" #include "bitstream.h" #include "decoder.h" #include "frame.h" #include "predict.h" #include "quant.h" #include "cpu.h" /** * =========================================================================== * const variable defines * =========================================================================== */ extern const int8_t *tab_DL_Avails[MAX_CU_SIZE_IN_BIT + 1]; extern const int8_t *tab_TR_Avails[MAX_CU_SIZE_IN_BIT + 1]; static const uint8_t ALPHA_TABLE[64] = { 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 3, 3, 4, 4, 5, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 18, 20, 22, 24, 26, 28, 30, 33, 33, 35, 35, 36, 37, 37, 39, 39, 42, 44, 46, 48, 50, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64 }; /* --------------------------------------------------------------------------- */ static const uint8_t BETA_TABLE[64] = { 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 9, 9, 10, 10, 11, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 23, 24, 24, 25, 25, 26, 27 }; /* --------------------------------------------------------------------------- * extension id */ enum extension_id_e { SEQUENCE_DISPLAY_EXTENSION_ID = 2, TEMPORAL_SCALABLE_EXTENSION_ID = 3, COPYRIGHT_EXTENSION_ID = 4, PICTURE_DISPLAY_EXTENSION_ID = 7, CAMERAPARAMETERS_EXTENSION_ID = 11, LOCATION_DATA_EXTENSION_ID = 15 }; #define ROI_DATA_FILE "roi.dat" // ROI location data output static bool_t open_dbp_buffer_warning = 1; /** * =========================================================================== * local function defines * =========================================================================== */ static INLINE int is_valid_qp(davs2_t *h, int i_qp) { return i_qp >= 0 && i_qp <= (63 + 8 * (h->sample_bit_depth - 8)); } /* --------------------------------------------------------------------------- */ static void davs2_reconfigure_decoder(davs2_mgr_t *h) { UNUSED_PARAMETER(h); } /* --------------------------------------------------------------------------- * sequence header */ static int parse_sequence_header(davs2_mgr_t *mgr, davs2_seq_t *seq, davs2_bs_t *bs) { static const float FRAME_RATE[8] = { 24000.0f / 1001.0f, 24.0f, 25.0f, 30000.0f / 1001.0f, 30.0f, 50.0f, 60000.0f / 1001.0f, 60.0f }; rps_t *p_rps = NULL; int i, j; int num_of_rps; bs->i_bit_pos += 32; /* skip start code */ memset(seq, 0, sizeof(davs2_seq_t)); // reset all value seq->head.profile_id = u_v(bs, 8, "profile_id"); seq->head.level_id = u_v(bs, 8, "level_id"); seq->head.progressive = u_v(bs, 1, "progressive_sequence"); seq->b_field_coding = u_flag(bs, "field_coded_sequence"); seq->head.width = u_v(bs, 14, "horizontal_size"); seq->head.height = u_v(bs, 14, "vertical_size"); if (seq->head.width < 16 || seq->head.height < 16) { return -1; } seq->head.chroma_format = u_v(bs, 2, "chroma_format"); if (seq->head.chroma_format != CHROMA_420 && seq->head.chroma_format != CHROMA_400) { return -1; } if (seq->head.chroma_format == CHROMA_400) { davs2_log(mgr, DAVS2_LOG_WARNING, "Un-supported Chroma Format YUV400 as 0 for GB/T.\n"); } /* sample bit depth */ if (seq->head.profile_id == MAIN10_PROFILE) { seq->sample_precision = u_v(bs, 3, "sample_precision"); seq->encoding_precision = u_v(bs, 3, "encoding_precision"); } else { seq->sample_precision = u_v(bs, 3, "sample_precision"); seq->encoding_precision = 1; } if (seq->sample_precision < 1 || 
seq->sample_precision > 3 || seq->encoding_precision < 1 || seq->encoding_precision > 3) { return -1; } seq->head.internal_bit_depth = 6 + (seq->encoding_precision << 1); seq->head.output_bit_depth = 6 + (seq->encoding_precision << 1); seq->head.bytes_per_sample = seq->head.output_bit_depth > 8 ? 2 : 1; /* */ seq->head.aspect_ratio = u_v(bs, 4, "aspect_ratio_information"); seq->head.frame_rate_id = u_v(bs, 4, "frame_rate_id"); seq->bit_rate_lower = u_v(bs, 18, "bit_rate_lower"); u_v(bs, 1, "marker bit"); seq->bit_rate_upper = u_v(bs, 12, "bit_rate_upper"); seq->head.low_delay = u_v(bs, 1, "low_delay"); u_v(bs, 1, "marker bit"); seq->b_temporal_id_exist = u_flag(bs, "temporal_id exist flag"); // get Extension Flag u_v(bs, 18, "bbv buffer size"); seq->log2_lcu_size = u_v(bs, 3, "Largest Coding Block Size"); if (seq->log2_lcu_size < 4 || seq->log2_lcu_size > 6) { davs2_log(mgr, DAVS2_LOG_ERROR, "Invalid LCU size: %d\n", seq->log2_lcu_size); return -1; } seq->enable_weighted_quant = u_flag(bs, "enable_weighted_quant"); if (seq->enable_weighted_quant) { int load_seq_wquant_data_flag; int x, y, sizeId, uiWqMSize; const int *Seq_WQM; load_seq_wquant_data_flag = u_flag(bs, "load_seq_weight_quant_data_flag"); for (sizeId = 0; sizeId < 2; sizeId++) { uiWqMSize = DAVS2_MIN(1 << (sizeId + 2), 8); if (load_seq_wquant_data_flag == 1) { for (y = 0; y < uiWqMSize; y++) { for (x = 0; x < uiWqMSize; x++) { seq->seq_wq_matrix[sizeId][y * uiWqMSize + x] = (int16_t)ue_v(bs, "weight_quant_coeff"); } } } else if (load_seq_wquant_data_flag == 0) { Seq_WQM = wq_get_default_matrix(sizeId); for (i = 0; i < (uiWqMSize * uiWqMSize); i++) { seq->seq_wq_matrix[sizeId][i] = (int16_t)Seq_WQM[i]; } } } } seq->enable_background_picture = u_flag(bs, "background_picture_disable") ^ 0x01; seq->enable_mhp_skip = u_flag(bs, "mhpskip enabled"); seq->enable_dhp = u_flag(bs, "dhp enabled"); seq->enable_wsm = u_flag(bs, "wsm enabled"); seq->enable_amp = u_flag(bs, "Asymmetric Motion Partitions"); seq->enable_nsqt = u_flag(bs, "use NSQT"); seq->enable_sdip = u_flag(bs, "use NSIP"); seq->enable_2nd_transform = u_flag(bs, "secT enabled"); seq->enable_sao = u_flag(bs, "SAO Enable Flag"); seq->enable_alf = u_flag(bs, "ALF Enable Flag"); seq->enable_pmvr = u_flag(bs, "pmvr enabled"); if (1 != u_v(bs, 1, "marker bit")) { davs2_log(mgr, DAVS2_LOG_ERROR, "expected marker_bit 1 while received 0, FILE %s, Row %d\n", __FILE__, __LINE__); } num_of_rps = u_v(bs, 6, "num_of_RPS"); if (num_of_rps > AVS2_GOP_NUM) { return -1; } seq->num_of_rps = num_of_rps; for (i = 0; i < num_of_rps; i++) { p_rps = &seq->seq_rps[i]; p_rps->refered_by_others = u_v(bs, 1, "refered by others"); p_rps->num_of_ref = u_v(bs, 3, "num of reference picture"); for (j = 0; j < p_rps->num_of_ref; j++) { p_rps->ref_pic[j] = u_v(bs, 6, "delta COI of ref pic"); } p_rps->num_to_remove = u_v(bs, 3, "num of removed picture"); for (j = 0; j < p_rps->num_to_remove; j++) { p_rps->remove_pic[j] = u_v(bs, 6, "delta COI of removed pic"); } if (1 != u_v(bs, 1, "marker bit")) { davs2_log(mgr, DAVS2_LOG_ERROR, "expected marker_bit 1 while received 0, FILE %s, Row %d\n", __FILE__, __LINE__); } } if (seq->head.low_delay == 0) { seq->picture_reorder_delay = u_v(bs, 5, "picture_reorder_delay"); } seq->cross_loop_filter_flag = u_flag(bs, "Cross Loop Filter Flag"); u_v(bs, 2, "reserved bits"); bs_align(bs); /* align position */ seq->head.bitrate = ((seq->bit_rate_upper << 18) + seq->bit_rate_lower) * 400; seq->head.frame_rate = FRAME_RATE[seq->head.frame_rate_id - 1]; seq->i_enc_width = 
((seq->head.width + MIN_CU_SIZE - 1) >> MIN_CU_SIZE_IN_BIT) << MIN_CU_SIZE_IN_BIT; seq->i_enc_height = ((seq->head.height + MIN_CU_SIZE - 1) >> MIN_CU_SIZE_IN_BIT) << MIN_CU_SIZE_IN_BIT; seq->valid_flag = 1; return 0; } /* --------------------------------------------------------------------------- * init deblock parame of one frame */ static INLINE void deblock_init_frame_parames(davs2_t *h) { int shift = h->sample_bit_depth - 8; int QP = h->i_picture_qp - (shift << 3); int QP_c = cu_get_chroma_qp(h, h->i_picture_qp, 0) - (shift << 3); h->alpha = ALPHA_TABLE[DAVS2_CLIP3(0, 63, QP + h->i_alpha_offset)] << shift; h->beta = BETA_TABLE[DAVS2_CLIP3(0, 63, QP + h->i_beta_offset)] << shift; h->alpha_c = ALPHA_TABLE[DAVS2_CLIP3(0, 63, QP_c + h->i_alpha_offset)] << shift; h->beta_c = BETA_TABLE[DAVS2_CLIP3(0, 63, QP_c + h->i_beta_offset)] << shift; if (gf_davs2.set_deblock_const != NULL) { gf_davs2.set_deblock_const(); } } /* --------------------------------------------------------------------------- * Intra picture header */ static int parse_picture_header_intra(davs2_t *h, davs2_bs_t *bs) { int time_code_flag; int progressive_frame; int predict; int i; h->i_frame_type = AVS2_I_SLICE; /* skip start code */ bs->i_bit_pos += 32; u_v(bs, 32, "bbv_delay"); time_code_flag = u_v(bs, 1, "time_code_flag"); if (time_code_flag) { /* time_code = */ u_v(bs, 24, "time_code"); } if (h->b_bkgnd_picture) { int background_picture_flag = u_v(bs, 1, "background_picture_flag"); if (background_picture_flag) { int b_output = u_v(bs, 1, "background_picture_output_flag"); if (b_output) { h->i_frame_type = AVS2_G_SLICE; } else { h->i_frame_type = AVS2_GB_SLICE; } } } h->i_coi = u_v(bs, 8, "coding_order"); if (h->seq_info.b_temporal_id_exist == 1) { h->i_cur_layer = u_v(bs, TEMPORAL_MAXLEVEL_BIT, "temporal_id"); } if (h->seq_info.head.low_delay == 0) { h->i_display_delay = ue_v(bs, "picture_output_delay"); if (h->i_display_delay >= 64) { davs2_log(h, DAVS2_LOG_ERROR, "invalid picture output delay intra."); return -1; } } predict = u_v(bs, 1, "use RCS in SPS"); if (predict) { int index = u_v(bs, 5, "predict for RCS"); if (index >= h->seq_info.num_of_rps) { davs2_log(h, DAVS2_LOG_ERROR, "invalid rps index."); return -1; } h->rps = h->seq_info.seq_rps[index]; } else { h->rps.refered_by_others = u_v(bs, 1, "refered by others"); h->rps.num_of_ref = u_v(bs, 3, "num of reference picture"); if (h->rps.num_of_ref > AVS2_MAX_REFS) { davs2_log(h, DAVS2_LOG_ERROR, "invalid number of references."); return -1; } for (i = 0; i < h->rps.num_of_ref; i++) { h->rps.ref_pic[i] = u_v(bs, 6, "delta COI of ref pic"); } h->rps.num_to_remove = u_v(bs, 3, "num of removed picture"); assert(h->rps.num_to_remove <= sizeof(h->rps.remove_pic) / sizeof(h->rps.remove_pic[0])); for (i = 0; i < h->rps.num_to_remove; i++) { h->rps.remove_pic[i] = u_v(bs, 6, "delta COI of removed pic"); } u_v(bs, 1, "marker bit"); } if (h->seq_info.head.low_delay) { /* bbv_check_times = */ ue_v(bs, "bbv check times"); } progressive_frame = u_v(bs, 1, "progressive_frame"); if (!progressive_frame) { h->i_pic_coding_type = (int8_t)u_v(bs, 1, "picture_structure"); } else { h->i_pic_coding_type = FRAME; } h->b_top_field_first = u_flag(bs, "top_field_first"); h->b_repeat_first_field = u_flag(bs, "repeat_first_field"); if (h->seq_info.b_field_coding) { h->b_top_field = u_flag(bs, "is_top_field"); /* reserved = */ u_v(bs, 1, "reserved bit for interlace coding"); } h->b_fixed_picture_qp = u_flag(bs, "fixed_picture_qp"); h->i_picture_qp = u_v(bs, 7, "picture_qp"); h->b_loop_filter = 
u_v(bs, 1, "loop_filter_disable") ^ 0x01; if (h->b_loop_filter) { int loop_filter_parameter_flag = u_v(bs, 1, "loop_filter_parameter_flag"); if (loop_filter_parameter_flag) { h->i_alpha_offset = se_v(bs, "alpha_offset"); h->i_beta_offset = se_v(bs, "beta_offset"); } else { h->i_alpha_offset = 0; h->i_beta_offset = 0; } deblock_init_frame_parames(h); } h->enable_chroma_quant_param = !u_flag(bs, "chroma_quant_param_disable"); if (h->enable_chroma_quant_param) { h->chroma_quant_param_delta_u = se_v(bs, "chroma_quant_param_delta_cb"); h->chroma_quant_param_delta_v = se_v(bs, "chroma_quant_param_delta_cr"); } else { h->chroma_quant_param_delta_u = 0; h->chroma_quant_param_delta_v = 0; } // adaptive frequency weighting quantization h->seq_info.enable_weighted_quant = 0; if (h->seq_info.enable_weighted_quant) { int pic_weight_quant_enable = u_v(bs, 1, "pic_weight_quant_enable"); if (pic_weight_quant_enable) { weighted_quant_t *p = &h->wq; p->pic_wq_data_index = u_v(bs, 2, "pic_wq_data_index"); if (p->pic_wq_data_index == 1) { /* int mb_adapt_wq_disable = */ u_v(bs, 1, "reserved_bits"); p->wq_param = u_v(bs, 2, "weighting_quant_param_index"); p->wq_model = u_v(bs, 2, "wq_model"); if (p->wq_param == 1) { for (i = 0; i < 6; i++) { p->quant_param_undetail[i] = (int16_t)se_v(bs, "quant_param_delta_u") + wq_param_default[UNDETAILED][i]; } } if (p->wq_param == 2) { for (i = 0; i < 6; i++) { p->quant_param_detail[i] = (int16_t)se_v(bs, "quant_param_delta_d") + wq_param_default[DETAILED][i]; } } } else if (p->pic_wq_data_index == 2) { int x, y, sizeId, uiWqMSize; for (sizeId = 0; sizeId < 2; sizeId++) { i = 0; uiWqMSize = DAVS2_MIN(1 << (sizeId + 2), 8); for (y = 0; y < uiWqMSize; y++) { for (x = 0; x < uiWqMSize; x++) { p->pic_user_wq_matrix[sizeId][i++] = (int16_t)ue_v(bs, "weight_quant_coeff"); } } } } h->seq_info.enable_weighted_quant = 1; } } alf_read_param(h, bs); h->i_qp = h->i_picture_qp; if (!is_valid_qp(h, h->i_qp)) { davs2_log(h, DAVS2_LOG_ERROR, "Invalid I Picture QP: %d\n", h->i_qp); } /* align position in bitstream buffer */ bs_align(bs); return 0; } /* --------------------------------------------------------------------------- * Inter picture header */ static int parse_picture_header_inter(davs2_t *h, davs2_bs_t *bs) { int background_pred_flag; int progressive_frame; int predict; int i; /* skip start code */ bs->i_bit_pos += 32; u_v(bs, 32, "bbv delay"); h->i_pic_struct = (int8_t)u_v(bs, 2, "picture_coding_type"); if (h->b_bkgnd_picture && (h->i_pic_struct == 1 || h->i_pic_struct == 3)) { if (h->i_pic_struct == 1) { background_pred_flag = u_v(bs, 1, "background_pred_flag"); } else { background_pred_flag = 0; } if (background_pred_flag == 0) { h->b_bkgnd_reference = u_flag(bs, "background_reference_enable"); } else { h->b_bkgnd_reference = 0; } } else { background_pred_flag = 0; h->b_bkgnd_reference = 0; } if (h->i_pic_struct == 1 && background_pred_flag) { h->i_frame_type = AVS2_S_SLICE; } else if (h->i_pic_struct == 1) { h->i_frame_type = AVS2_P_SLICE; } else if (h->i_pic_struct == 3) { h->i_frame_type = AVS2_F_SLICE; } else { h->i_frame_type = AVS2_B_SLICE; } h->i_coi = u_v(bs, 8, "coding_order"); if (h->seq_info.b_temporal_id_exist == 1) { h->i_cur_layer = u_v(bs, TEMPORAL_MAXLEVEL_BIT, "temporal_id"); } if (h->seq_info.head.low_delay == 0) { h->i_display_delay = ue_v(bs, "displaydelay"); if (h->i_display_delay >= 64) { davs2_log(h, DAVS2_LOG_ERROR, "invalid picture output delay inter."); return -1; } } /* */ predict = u_v(bs, 1, "use RPS in SPS"); if (predict) { int index = u_v(bs, 5, 
"predict for RPS"); if (index >= h->seq_info.num_of_rps) { davs2_log(h, DAVS2_LOG_ERROR, "invalid rps index."); return -1; } h->rps = h->seq_info.seq_rps[index]; } else { // GOP size h->rps.refered_by_others = u_v(bs, 1, "refered by others"); h->rps.num_of_ref = u_v(bs, 3, "num of reference picture"); for (i = 0; i < h->rps.num_of_ref; i++) { h->rps.ref_pic[i] = u_v(bs, 6, "delta COI of ref pic"); } h->rps.num_to_remove = u_v(bs, 3, "num of removed picture"); assert(h->rps.num_to_remove <= sizeof(h->rps.remove_pic) / sizeof(h->rps.remove_pic[0])); for (i = 0; i < h->rps.num_to_remove; i++) { h->rps.remove_pic[i] = u_v(bs, 6, "delta COI of removed pic"); } u_v(bs, 1, "marker bit"); } if (h->seq_info.head.low_delay) { ue_v(bs, "bbv check times"); } progressive_frame = u_v(bs, 1, "progressive_frame"); if (!progressive_frame) { h->i_pic_coding_type = (int8_t)u_v(bs, 1, "picture_structure"); } else { h->i_pic_coding_type = FRAME; } h->b_top_field_first = u_flag(bs, "top_field_first"); h->b_repeat_first_field = u_flag(bs, "repeat_first_field"); if (h->seq_info.b_field_coding) { h->b_top_field =u_flag(bs, "is_top_field"); u_v(bs, 1, "reserved bit for interlace coding"); } h->b_fixed_picture_qp = u_flag(bs, "fixed_picture_qp"); h->i_picture_qp = u_v(bs, 7, "picture_qp"); if (!(h->i_pic_struct == 2 && h->i_pic_coding_type == FRAME)) { u_v(bs, 1, "reserved_bit"); } h->b_ra_decodable = u_flag(bs, "random_access_decodable_flag"); h->b_loop_filter = u_v(bs, 1, "loop_filter_disable") ^ 0x01; if (h->b_loop_filter) { int loop_filter_parameter_flag = u_v(bs, 1, "loop_filter_parameter_flag"); if (loop_filter_parameter_flag) { h->i_alpha_offset = se_v(bs, "alpha_offset"); h->i_beta_offset = se_v(bs, "beta_offset"); } else { h->i_alpha_offset = 0; h->i_beta_offset = 0; } deblock_init_frame_parames(h); } h->enable_chroma_quant_param = !u_flag(bs, "chroma_quant_param_disable"); if (h->enable_chroma_quant_param) { h->chroma_quant_param_delta_u = se_v(bs, "chroma_quant_param_delta_cb"); h->chroma_quant_param_delta_v = se_v(bs, "chroma_quant_param_delta_cr"); } else { h->chroma_quant_param_delta_u = 0; h->chroma_quant_param_delta_v = 0; } // adaptive frequency weighting quantization h->seq_info.enable_weighted_quant = 0; if (h->seq_info.enable_weighted_quant) { int pic_weight_quant_enable = u_v(bs, 1, "pic_weight_quant_enable"); if (pic_weight_quant_enable) { weighted_quant_t *p = &h->wq; p->pic_wq_data_index = u_v(bs, 2, "pic_wq_data_index"); if (p->pic_wq_data_index == 1) { /* int mb_adapt_wq_disable = */ u_v(bs, 1, "reserved_bits"); p->wq_param = u_v(bs, 2, "weighting_quant_param_index"); p->wq_model = u_v(bs, 2, "wq_model"); if (p->wq_param == 1) { for (i = 0; i < 6; i++) { p->quant_param_undetail[i] = (int16_t)se_v(bs, "quant_param_delta_u") + wq_param_default[UNDETAILED][i]; } } if (p->wq_param == 2) { for (i = 0; i < 6; i++) { p->quant_param_detail[i] = (int16_t)se_v(bs, "quant_param_delta_d") + wq_param_default[DETAILED][i]; } } } else if (p->pic_wq_data_index == 2) { int x, y, sizeId, uiWqMSize; for (sizeId = 0; sizeId < 2; sizeId++) { i = 0; uiWqMSize = DAVS2_MIN(1 << (sizeId + 2), 8); for (y = 0; y < uiWqMSize; y++) { for (x = 0; x < uiWqMSize; x++) { p->pic_user_wq_matrix[sizeId][i++] = (int16_t)ue_v(bs, "weight_quant_coeff"); } } } } h->seq_info.enable_weighted_quant = 1; } } alf_read_param(h, bs); h->i_qp = h->i_picture_qp; if (!is_valid_qp(h, h->i_qp)) { davs2_log(h, DAVS2_LOG_ERROR, "Invalid PB Picture QP: %d\n", h->i_qp); } /* align position in bitstream buffer */ bs_align(bs); return 0; } /* 
--------------------------------------------------------------------------- */ static int parse_picture_header(davs2_t *h, davs2_bs_t *bs, uint32_t start_code) { davs2_mgr_t *mgr = h->task_info.taskmgr; assert(start_code == SC_INTRA_PICTURE || start_code == SC_INTER_PICTURE); if (start_code == SC_INTRA_PICTURE) { if (parse_picture_header_intra(h, bs) < 0) { return -1; } } else { if (mgr->outpics.output == -1) { /* An I frame is expected for the first frame or after the decoder is flushed. */ davs2_log(h, DAVS2_LOG_ERROR, "sequence should start with an I frame."); return -1; } if (parse_picture_header_inter(h, bs) < 0) { return -1; } } /* field picture ? */ if (h->i_pic_coding_type != FRAME) { davs2_log(h, DAVS2_LOG_ERROR, "field is not supported."); return -1; } /* COI should be a periodically-repeated value from 0 to 255 */ if (mgr->outpics.output != -1 && h->i_coi != (mgr->i_prev_coi + 1) % AVS2_COI_CYCLE) { davs2_log(h, DAVS2_LOG_DEBUG, "discontinuous COI (prev: %d --> curr: %d).", mgr->i_prev_coi, h->i_coi); } /* update COI */ if (h->i_coi < mgr->i_prev_coi) { /// !!! '=' mgr->i_tr_wrap_cnt++; } mgr->i_prev_coi = h->i_coi; h->i_coi += mgr->i_tr_wrap_cnt * AVS2_COI_CYCLE; if (h->seq_info.head.low_delay == 0) { h->i_poc = h->i_coi + h->i_display_delay - h->seq_info.picture_reorder_delay; } else { h->i_poc = h->i_coi; } assert(h->i_coi >= 0 && h->i_poc >= 0); /// 'int' (2147483647) should be large enough for 'i_coi' & 'i_poc'. if (mgr->outpics.output == -1 && start_code == SC_INTRA_PICTURE) { if (h->i_coi != 0) { davs2_log(h, DAVS2_LOG_INFO, "COI of the first frame is %d.", h->i_coi); } mgr->outpics.output = h->i_poc; } return 0; } /** * =========================================================================== * interface function defines * =========================================================================== */ /* --------------------------------------------------------------------------- */ void parse_slice_header(davs2_t *h, davs2_bs_t *bs) { int slice_vertical_position; int slice_vertical_position_extension = 0; int slice_horizontal_positon; int slice_horizontal_positon_extension; int mb_row; /* skip start code: 00 00 01 */ bs->i_bit_pos += 24; slice_vertical_position = u_v(bs, 8, "slice vertical position"); if (h->i_image_height > (144 * h->i_lcu_size)) { slice_vertical_position_extension = u_v(bs, 3, "slice vertical position extension"); } if (h->i_image_height > (144 * h->i_lcu_size)) { mb_row = (slice_vertical_position_extension << 7) + slice_vertical_position; } else { mb_row = slice_vertical_position; } slice_horizontal_positon = u_v(bs, 8, "slice horizontal position"); if (h->i_width > (255 * h->i_lcu_size)) { slice_horizontal_positon_extension = u_v(bs, 2, "slice horizontal position extension"); } if (!h->b_fixed_picture_qp) { h->b_fixed_slice_qp = u_flag(bs, "fixed_slice_qp"); h->i_slice_qp = u_v(bs, 7, "slice_qp"); h->b_DQP = !h->b_fixed_slice_qp; } else { h->i_slice_qp = h->i_picture_qp; h->b_DQP = 0; } h->i_qp = h->i_slice_qp; if (!is_valid_qp(h, h->i_qp)) { davs2_log(h, DAVS2_LOG_ERROR, "Invalid Slice QP: %d\n", h->i_qp); } if (h->b_sao) { h->slice_sao_on[0] = u_flag(bs, "sao_slice_flag_Y"); h->slice_sao_on[1] = u_flag(bs, "sao_slice_flag_Cb"); h->slice_sao_on[2] = u_flag(bs, "sao_slice_flag_Cr"); } } /* --------------------------------------------------------------------------- */ davs2_outpic_t *alloc_picture(int w, int h) { davs2_outpic_t *pic = NULL; uint8_t *buf; buf = (uint8_t *)davs2_malloc(sizeof(davs2_outpic_t) + sizeof(davs2_seq_info_t) + 
sizeof(davs2_picture_t) + sizeof(pel_t) * w * h * 3 / 2); if (buf == NULL) { return NULL; } pic = (davs2_outpic_t *)buf; buf += sizeof(davs2_outpic_t); /* davs2_outpic_t */ pic->frame = NULL; pic->next = NULL; pic->head = (davs2_seq_info_t *)buf; buf += sizeof(davs2_seq_info_t); pic->pic = (davs2_picture_t *)buf; buf += sizeof(davs2_picture_t); pic->pic->num_planes = 3; pic->pic->planes[0] = buf; pic->pic->planes[1] = pic->pic->planes[0] + w * h * sizeof(pel_t); pic->pic->planes[2] = pic->pic->planes[1] + w * h / 4 * sizeof(pel_t); pic->pic->widths[0] = w; pic->pic->widths[1] = w / 2; pic->pic->widths[2] = w / 2; pic->pic->lines [0] = h; pic->pic->lines [1] = h / 2; pic->pic->lines [2] = h / 2; pic->pic->dec_frame = NULL; return pic; } /* --------------------------------------------------------------------------- */ void free_picture(davs2_outpic_t *pic) { if (pic) { davs2_free(pic); } } /* --------------------------------------------------------------------------- * destroy decoding picture buffer(DPB) */ void destroy_dpb(davs2_mgr_t *mgr) { davs2_frame_t *frame = NULL; int i; for (i = 0; i < mgr->dpbsize; i++) { frame = mgr->dpb[i]; assert(frame); mgr->dpb[i] = NULL; davs2_thread_mutex_lock(&frame->mutex_frm); if (frame->i_ref_count == 0) { davs2_thread_mutex_unlock(&frame->mutex_frm); davs2_frame_destroy(frame); } else { frame->i_disposable = 2; /* free when not referenced */ davs2_thread_mutex_unlock(&frame->mutex_frm); } } davs2_free(mgr->dpb); mgr->dpb = NULL; } /* --------------------------------------------------------------------------- * create decoding picture buffer(DPB) */ static INLINE int create_dpb(davs2_mgr_t *mgr) { davs2_seq_t *seq = &mgr->seq_info; uint8_t *mem_ptr = NULL; size_t mem_size = 0; int i; mgr->dpbsize = mgr->num_decoders + seq->picture_reorder_delay + 16; /// !!! FIXME: decide dpb buffer size ? 
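/* sizing note (editorial, numbers purely illustrative): with 8 decoder
 * threads and picture_reorder_delay == 16, the line above yields
 * 8 + 16 + 16 = 40 frames; the increment below adds a further safety
 * margin on top of that. */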
mgr->dpbsize += 8; // FIXME: extra safety margin; the exact DPB size requirement is still to be determined
    mem_size = mgr->dpbsize * sizeof(davs2_frame_t *) +
        davs2_frame_get_size(seq->i_enc_width, seq->i_enc_height, seq->head.chroma_format, 1) * mgr->dpbsize +
        davs2_frame_get_size(seq->i_enc_width, seq->i_enc_height, seq->head.chroma_format, 0) +
        CACHE_LINE_SIZE * (mgr->dpbsize + 2);
    mem_ptr = (uint8_t *)davs2_malloc(mem_size);
    if (mem_ptr == NULL) {
        return -1;
    }
    mgr->dpb = (davs2_frame_t **)mem_ptr;
    mem_ptr += mgr->dpbsize * sizeof(davs2_frame_t *);
    ALIGN_POINTER(mem_ptr);
    for (i = 0; i < mgr->dpbsize; i++) {
        mgr->dpb[i] = davs2_frame_new(seq->i_enc_width, seq->i_enc_height, seq->head.chroma_format, &mem_ptr, 1);
        ALIGN_POINTER(mem_ptr);
        if (mgr->dpb[i] == NULL) {
            return -1;
        }
    }
    return 0;
}

/* --------------------------------------------------------------------------- */
static void init_fdec(davs2_t *h, int64_t pts, int64_t dts)
{
    int num_in_spu = h->i_width_in_spu * h->i_height_in_spu;
    int i;
    h->fdec->i_type = h->i_frame_type;
    h->fdec->i_qp = h->i_qp;
    h->fdec->i_poc = h->i_poc;
    h->fdec->i_coi = h->i_coi;
    h->fdec->b_refered_by_others = h->rps.refered_by_others;
    h->fdec->i_decoded_line = -1;
    h->fdec->i_pts = pts;
    h->fdec->i_dts = dts;
    for (i = 0; i < AVS2_MAX_REFS; i++) {
        h->fdec->dist_refs[i] = -1;
        h->fdec->dist_scale_refs[i] = -1;
    }
    if (h->i_frame_type != AVS2_B_SLICE) {
        for (i = 0; i < h->num_of_references; i++) {
            h->fdec->dist_refs[i] = AVS2_DISTANCE_INDEX(2 * (h->fdec->i_poc - h->fref[i]->i_poc));
            if (h->fdec->dist_refs[i] <= 0) {
                davs2_log(h, DAVS2_LOG_ERROR, "invalid reference frame distance.");
                h->fdec->dist_refs[i] = 1;
            }
            h->fdec->dist_scale_refs[i] = (MULTI / h->fdec->dist_refs[i]);
        }
    } else {
        h->fdec->dist_refs[B_FWD] = AVS2_DISTANCE_INDEX(2 * (h->fdec->i_poc - h->fref[B_FWD]->i_poc));
        h->fdec->dist_refs[B_BWD] = AVS2_DISTANCE_INDEX(2 * (h->fref[B_BWD]->i_poc - h->fdec->i_poc));
        if (h->fdec->dist_refs[B_FWD] <= 0) {
            davs2_log(h, DAVS2_LOG_ERROR, "invalid reference frame distance. B_FWD");
            h->fdec->dist_refs[B_FWD] = 1;
        }
        if (h->fdec->dist_refs[B_BWD] <= 0) {
            davs2_log(h, DAVS2_LOG_ERROR, "invalid reference frame distance.
B_BWD"); h->fdec->dist_refs[B_BWD] = 1; } h->fdec->dist_scale_refs[B_FWD] = (MULTI / h->fdec->dist_refs[B_FWD]); h->fdec->dist_scale_refs[B_BWD] = (MULTI / h->fdec->dist_refs[B_BWD]); } /* clear mvbuf and refbuf */ memset(h->fdec->mvbuf, 0, num_in_spu * sizeof(mv_t)); memset(h->fdec->refbuf, INVALID_REF, num_in_spu * sizeof(int8_t)); } /* --------------------------------------------------------------------------- */ int task_decoder_update(davs2_t *h) { davs2_mgr_t *mgr = h->task_info.taskmgr; davs2_seq_t *seq = &mgr->seq_info; if (seq->valid_flag == 0) { davs2_log(h, DAVS2_LOG_ERROR, "failed to update decoder (invalid sequence header)."); return -1; } if (h->b_sao != seq->enable_sao || h->b_alf != seq->enable_alf || h->i_chroma_format != (int)seq->head.chroma_format || h->i_lcu_level != seq->log2_lcu_size || h->i_image_width != (int)seq->head.width || h->i_image_height != (int)seq->head.height || h->p_integral == NULL) { /* resolution changed */ decoder_free_extra_buffer(h); /* key properties of the video sequence: size and color format */ h->i_lcu_level = seq->log2_lcu_size; h->i_lcu_size = 1 << h->i_lcu_level; h->i_lcu_size_sub1 = (1 << h->i_lcu_level) - 1; h->i_chroma_format = seq->head.chroma_format; h->i_image_width = seq->head.width; h->i_image_height = seq->head.height; h->i_width = seq->i_enc_width; h->i_height = seq->i_enc_height; h->i_width_in_scu = h->i_width >> MIN_CU_SIZE_IN_BIT; h->i_height_in_scu = h->i_height >> MIN_CU_SIZE_IN_BIT; h->i_size_in_scu = h->i_width_in_scu * h->i_height_in_scu; h->i_width_in_spu = h->i_width >> MIN_PU_SIZE_IN_BIT; h->i_height_in_spu = h->i_height >> MIN_PU_SIZE_IN_BIT; h->i_width_in_lcu = (h->i_width + h->i_lcu_size_sub1) >> h->i_lcu_level; h->i_height_in_lcu = (h->i_height + h->i_lcu_size_sub1) >> h->i_lcu_level; /* encoding tools configuration */ h->b_sao = seq->enable_sao; h->b_alf = seq->enable_alf; if (decoder_alloc_extra_buffer(h) < 0) { h->i_lcu_level = 0; h->i_chroma_format = 0; h->i_image_width = 0; h->i_image_height = 0; davs2_log(h, DAVS2_LOG_ERROR, "failed to update the decoder(failed to alloc space)."); return -1; } } /* update sequence header */ h->i_chroma_format = seq->head.chroma_format; h->i_lcu_level = seq->log2_lcu_size; h->b_bkgnd_picture = seq->enable_background_picture; // h->b_dmh = 1; h->output_bit_depth = 8; h->sample_bit_depth = 8; h->p_tab_DL_avail = tab_DL_Avails[h->i_lcu_level]; h->p_tab_TR_avail = tab_TR_Avails[h->i_lcu_level]; if (seq->head.profile_id == MAIN10_PROFILE) { h->output_bit_depth = 6 + (seq->sample_precision << 1); h->sample_bit_depth = 6 + (seq->encoding_precision << 1); } #if HIGH_BIT_DEPTH g_bit_depth = h->sample_bit_depth; max_pel_value = (1 << g_bit_depth) - 1; g_dc_value = 1 << (g_bit_depth - 1); #else if (g_bit_depth != h->sample_bit_depth) { davs2_log(h, DAVS2_LOG_ERROR, "Un-supported bit-depth %d in this version.\n", h->sample_bit_depth); return -1; } #endif memcpy(h->wq.seq_wq_matrix, seq->seq_wq_matrix, 2 * 64 * sizeof(int16_t)); /* weighting quantization matrix */ memcpy(&h->seq_info, seq, sizeof(davs2_seq_t)); return 0; } /* --------------------------------------------------------------------------- */ static int task_set_sequence_head(davs2_mgr_t *mgr, davs2_seq_t *seq) { int ret = 0; davs2_thread_mutex_lock(&mgr->mutex_mgr); davs2_reconfigure_decoder(mgr); if (seq->valid_flag) { int newres = (mgr->seq_info.head.height != seq->head.height || mgr->seq_info.head.width != seq->head.width); memcpy(&mgr->seq_info, seq, sizeof(davs2_seq_t)); if (newres) { /* resolution changed : new sequence 
*/
            davs2_log(mgr, DAVS2_LOG_INFO, "Sequence Resolution: %dx%d.", seq->head.width, seq->head.height);
            if ((seq->head.width & 1) != 0 || (seq->head.height & 1) != 0) {
                davs2_log(mgr, DAVS2_LOG_ERROR, "Sequence Resolution %dx%d is not even\n", seq->head.width, seq->head.height);
            }
            /* COI for the new sequence should be reset */
            mgr->i_tr_wrap_cnt = 0;
            mgr->i_prev_coi = -1;
            destroy_dpb(mgr);
            if (create_dpb(mgr) < 0) {
                /* error */
                ret = -1;
                memset(&mgr->seq_info, 0, sizeof(davs2_seq_t));
                davs2_log(mgr, DAVS2_LOG_ERROR, "failed to create dpb buffers. %dx%d.", seq->head.width, seq->head.height);
            }
            mgr->new_sps = TRUE;
        }
    } else {
        /* invalid header */
        memset(&mgr->seq_info, 0, sizeof(davs2_seq_t));
        davs2_log(mgr, DAVS2_LOG_ERROR, "decoded an invalid sequence header: %dx%d.", seq->head.width, seq->head.height);
    }
    davs2_thread_mutex_unlock(&mgr->mutex_mgr);
    return ret;
}

/* --------------------------------------------------------------------------- */
void clean_one_frame(davs2_frame_t *frame)
{
    frame->i_poc = INVALID_FRAME;
    frame->i_coi = INVALID_FRAME;
    frame->i_disposable = 0;
    frame->b_refered_by_others = 0;
}

/* --------------------------------------------------------------------------- */
void release_one_frame(davs2_frame_t *frame)
{
    int obsolete = 0;
    if (frame == NULL) {
        return;
    }
    davs2_thread_mutex_lock(&frame->mutex_frm);
    assert(frame->i_ref_count > 0);
    frame->i_ref_count--;
    if (frame->i_ref_count == 0) {
        if (frame->i_disposable == 1) {
            clean_one_frame(frame);
        }
        obsolete = frame->i_disposable == 2;
    }
    davs2_thread_mutex_unlock(&frame->mutex_frm);
    if (obsolete != 0) {
        davs2_frame_destroy(frame);
    }
}
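/* note (editorial) -- i_disposable acts as a small state machine for the
 * functions above:
 *   0: the frame may still be referenced, keep it as is;
 *   1: recycle it (clean_one_frame) once i_ref_count drops to zero;
 *   2: destroy it outright once i_ref_count drops to zero (set only
 *      while the DPB itself is being torn down, see destroy_dpb). */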
/* --------------------------------------------------------------------------- */
void task_release_frames(davs2_t *h)
{
    int i;
    /* release reference to all reference frames */
    for (i = 0; i < h->num_of_references; i++) {
        release_one_frame(h->fref[i]);
        h->fref[i] = NULL;
    }
    h->num_of_references = 0;
    /* release reference to the reconstructed frame */
    release_one_frame(h->fdec);
    h->fdec = NULL;
}

/* --------------------------------------------------------------------------- */
int has_blocking(davs2_mgr_t *mgr)
{
    davs2_output_t *pics = &mgr->outpics;
    davs2_outpic_t *pic = NULL;
    davs2_frame_t *frame = NULL;
    int decodingframes = 0, outputframes = 0;
    int i;
    /* is the expected frame already in the output list ? */
    for (pic = pics->pics; pic; pic = pic->next) {
        frame = pic->frame;
        if (frame->i_poc == pics->output) {
            /* the expected frame */
            return 0;
        } else if (frame->i_poc < pics->output) {
            /* a late frame: the output thread will dump it. */
            return 0;
        }
        outputframes++;
    }
    /* is the expected frame still under decoding ? */
    for (i = 0; i < mgr->num_decoders; i++) {
        davs2_t *h = &mgr->decoders[i];
        if (h->task_info.task_status != TASK_FREE) {
            frame = h->fdec;
            if (frame != NULL) {
                if (frame->i_poc == pics->output) {
                    /* the expected frame will be put into the output list soon */
                    return 0;
                }
                if (frame->i_poc >= 0) {
                    decodingframes++;
                }
            }
        }
    }
    assert(outputframes + decodingframes <= mgr->dpbsize);
    /* the expected frame is neither in the output list nor under decoding */
    if (mgr->outpics.busy != 0) {
        /* one frame is being delivered and may become free soon */
        return 0;
    }
    return 1;
}
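/* note (editorial): has_blocking() returns 1 only when the frame the
 * output thread is waiting for is neither queued for output nor being
 * decoded, and no frame is currently being delivered -- i.e. waiting any
 * longer cannot help, so the caller has to break the stall itself. */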
1 : 0; h->fdec = frame; davs2_thread_mutex_unlock(&frame->mutex_frm); break; } davs2_thread_mutex_unlock(&frame->mutex_frm); } if (h->fdec != NULL) { /* got it */ break; } /* DPB full ? */ if (open_dbp_buffer_warning) { davs2_log(h, DAVS2_LOG_WARNING, "running out of DPB buffers, performance may suffer."); open_dbp_buffer_warning = 0; /* avoid too many warnings */ } /* detect possible blocks */ if (has_blocking(mgr) != 0) { if (mgr->outpics.pics == NULL) { /*!!! try to use an earliest frame ??? */ /* find the frame with the least POC value */ for (i = 0; i < mgr->dpbsize; i++) { frame = dpb[i]; davs2_thread_mutex_lock(&frame->mutex_frm); if (frame->i_ref_count == 0 && (h->fdec == NULL || h->fdec->i_poc > frame->i_poc)) { if (h->fdec) { davs2_thread_mutex_lock(&h->fdec->mutex_frm); h->fdec->i_ref_count--; h->fdec->i_ref_count--; davs2_thread_mutex_unlock(&h->fdec->mutex_frm); } frame->i_ref_count++; /* for the decoding thread */ frame->i_ref_count++; /* for the output thread */ h->fdec = frame; } davs2_thread_mutex_unlock(&frame->mutex_frm); } if (NULL == h->fdec) { davs2_log(h, DAVS2_LOG_ERROR, "no frame for new task, DPB size (%d) too small(reorder delay: %d) ?", mgr->dpbsize, mgr->seq_info.picture_reorder_delay); goto fail; } h->fdec->i_disposable = h->rps.refered_by_others == 0 ? 1 : 0; davs2_log(h, DAVS2_LOG_WARNING, "force one frame as the reconstruction frame."); break; } else { /* next frame will not be available, skip it */ assert(mgr->outpics.output < mgr->outpics.pics->frame->i_poc); /* emit an error */ davs2_log(h, DAVS2_LOG_ERROR, "the expected frame %d unavailable, proceed to frame %d.", mgr->outpics.output, mgr->outpics.pics->frame->i_poc); /* output the next available frame */ mgr->outpics.output = mgr->outpics.pics->frame->i_poc; } } davs2_thread_mutex_unlock(&mgr->mutex_mgr); /* wait for the output thread to release some frames */ davs2_sleep_ms(1); /* check it again */ davs2_thread_mutex_lock(&mgr->mutex_mgr); } init_fdec(h, pts, dts); if (h->i_frame_type == AVS2_S_SLICE) { int num_in_spu = h->i_width_in_spu * h->i_height_in_spu; for (i = 0; i < mgr->dpbsize; i++) { memset(dpb[i]->mvbuf, 0, num_in_spu * sizeof(mv_t)); memset(dpb[i]->refbuf, 0, num_in_spu * sizeof(int8_t)); } } } davs2_thread_mutex_unlock(&mgr->mutex_mgr); return 0; fail: davs2_log(NULL, DAVS2_LOG_ERROR, "Failed to decode frame \n", h->i_coi, h->i_poc); davs2_thread_mutex_unlock(&mgr->mutex_mgr); task_release_frames(h); return -1; } /* --------------------------------------------------------------------------- */ int parse_header(davs2_t *h, davs2_bs_t *p_bs) { const uint8_t *data = p_bs->p_stream; int *bitpos = &p_bs->i_bit_pos; int len = p_bs->i_stream; const uint8_t *p_start_code = 0; if (len <= 4) { return -1; // at least 4 bytes are needed for decoding } while ((p_start_code = find_start_code(data + (*bitpos >> 3), len - (*bitpos >> 3))) != 0) { uint32_t start_code; *bitpos = (int)((p_start_code - data) << 3); if ((*bitpos >> 3) + 4 > len) { break; } start_code = data[(*bitpos >> 3) + 3]; switch (start_code) { case SC_INTRA_PICTURE: case SC_INTER_PICTURE: /* update the decoder */ if (task_decoder_update(h) < 0) { return -1; } /* decode the picture header */ if (parse_picture_header(h, p_bs, start_code) < 0) { return -1; } return 0; /// !!! we only decode one frame for a single call. 
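/* A sketch of the unit layout this loop scans for (inferred from the
 * pointer arithmetic above): every AVS2 bitstream unit begins with a
 * 4-byte start code,
 *
 *     byte offset:   0     1     2     3
 *     value:        0x00  0x00  0x01  unit type (e.g. SC_SEQUENCE_HEADER)
 *
 * find_start_code() points at byte 0, the unit-type byte is read at
 * offset 3, and the "*bitpos += 32" in the default branch below steps
 * over the whole 4-byte code before the next scan. Picture start codes
 * return to the caller, so exactly one frame is decoded per call. */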
case SC_SEQUENCE_HEADER: { davs2_seq_t new_seq; /* decode the sequence header */ if (parse_sequence_header(h->task_info.taskmgr, &new_seq, p_bs) < 0) { davs2_log(h, DAVS2_LOG_ERROR, "Invalid sequence header."); return -1; } /* update the task manager */ if (task_set_sequence_head(h->task_info.taskmgr, &new_seq) < 0) { return -1; } break; } case SC_EXTENSION: case SC_USER_DATA: case SC_SEQUENCE_END: case SC_VIDEO_EDIT_CODE: default: /* skip this unit */ /* NOTE: to decode these units, avoid relying on the davs2_t structure, which is not updated until a picture header is decoded. */ *bitpos += 32; break; } } return 1; } davs2-1.6/source/common/header.h000066400000000000000000000040721337322544400166020ustar00rootroot00000000000000/* * header.h * * Description of this file: * Header functions definition of the davs2 library * * -------------------------------------------------------------------------- * * davs2 - video decoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ #ifndef DAVS2_HEADER_H #define DAVS2_HEADER_H #ifdef __cplusplus extern "C" { #endif #define parse_slice_header FPFX(parse_slice_header) void parse_slice_header(davs2_t *h, davs2_bs_t *bs); #define parse_header FPFX(parse_header) int parse_header(davs2_t *h, davs2_bs_t *p_bs); #define release_one_frame FPFX(release_one_frame) void release_one_frame(davs2_frame_t *frame); #define task_release_frames FPFX(task_release_frames) void task_release_frames(davs2_t *h); #define alloc_picture FPFX(alloc_picture) davs2_outpic_t *alloc_picture(int w, int h); #define free_picture FPFX(free_picture) void free_picture(davs2_outpic_t *pic); #define destroy_dpb FPFX(destroy_dpb) void destroy_dpb(davs2_mgr_t *mgr); #ifdef __cplusplus } #endif #endif // DAVS2_HEADER_H davs2-1.6/source/common/intra.cc000066400000000000000000003205551337322544400166320ustar00rootroot00000000000000/* * intra.cc * * Description of this file: * Intra prediction functions definition of the davs2 library * * -------------------------------------------------------------------------- * * davs2 - video decoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ #include "common.h" #include "block_info.h" #include "intra.h" #include "vec/intrinsic.h" // --------------------------------------------------------------------------- // disable warning #if defined(_MSC_VER) || defined(__ICL) #pragma warning(disable: 4100) // unreferenced formal parameter #endif /* * =========================================================================== * global & local variable defines * =========================================================================== */ /* --------------------------------------------------------------------------- */ static const int8_t g_aucXYflg[NUM_INTRA_MODE] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 }; /* --------------------------------------------------------------------------- */ static const int8_t tab_auc_dir_dx[NUM_INTRA_MODE] = { 0, 0, 0, 11, 2, 11, 1, 8, 1, 4, 1, 1, 0, 1, 1, 4, 1, 8, 1, 11, 2, 11, 4, 8, 0, 8, 4, 11, 2, 11, 1, 8, 1 }; /* --------------------------------------------------------------------------- */ static const int8_t tab_auc_dir_dy[NUM_INTRA_MODE] = { 0, 0, 0, -4, -1, -8, -1, -11, -2, -11, -4, -8, 0, 8, 4, 11, 2, 11, 1, 8, 1, 4, 1, 1, 0, -1, -1, -4, -1, -8, -1, -11, -2 }; /* --------------------------------------------------------------------------- */ static const int8_t g_aucSign[NUM_INTRA_MODE] = { 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, -1, -1, -1, -1, -1, -1, -1, -1 }; /* --------------------------------------------------------------------------- */ static const int8_t tab_auc_dir_dxdy[2][NUM_INTRA_MODE][2] = { { // dx/dy { 0, 0 }, { 0, 0 }, { 0, 0 }, { 11, 2 }, { 2, 0 }, { 11, 3 }, { 1, 0 }, { 93, 7 }, { 1, 1 }, { 93, 8 }, { 1, 2 }, { 1, 3 }, { 0, 0 }, { 1, 3 }, { 1, 2 }, { 93, 8 }, { 1, 1 }, { 93, 7 }, { 1, 0 }, { 11, 3 }, { 2, 0 }, { 11, 2 }, { 4, 0 }, { 8, 0 }, { 0, 0 }, { 8, 0 }, { 4, 0 }, { 11, 2 }, { 2, 0 }, { 11, 3 }, { 1, 0 }, { 93, 7 }, { 1, 1 }, }, { // dy/dx { 0, 0 }, { 0, 0 }, { 0, 0 }, { 93, 8 }, { 1, 1 }, { 93, 7 }, { 1, 0 }, { 11, 3 }, { 2, 0 }, { 11, 2 }, { 4, 0 }, { 8, 0 }, { 0, 0 }, { 8, 0 }, { 4, 0 }, { 11, 2 }, { 2, 0 }, { 11, 3 }, { 1, 0 }, { 93, 7 }, { 1, 1 }, { 93, 8 }, { 1, 2 }, { 1, 3 }, { 0, 0 }, { 1, 3 }, { 1, 2 }, { 93, 8 }, { 1, 1 }, { 93, 7 }, { 1, 0 }, { 11, 3 }, { 2, 0 } } }; /* --------------------------------------------------------------------------- */ static const int8_t tab_log2size[MAX_CU_SIZE + 1] = { -1, -1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1, 4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 6 }; /* --------------------------------------------------------------------------- */ const int8_t tab_DL_Avail64[16 * 16] = { 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 
1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; /* --------------------------------------------------------------------------- */ const int8_t tab_DL_Avail32[8 * 8] = { 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; /* --------------------------------------------------------------------------- */ const int8_t tab_DL_Avail16[4 * 4] = { 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0 }; /* --------------------------------------------------------------------------- */ const int8_t tab_DL_Avail8[2 * 2] = { 1, 0, 0, 0 }; /* --------------------------------------------------------------------------- */ const int8_t tab_TR_Avail64[16 * 16] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0 }; /* --------------------------------------------------------------------------- */ const int8_t tab_TR_Avail32[8 * 8] = { // 0: 8 1:16 2: 32 pu size 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0 }; /* --------------------------------------------------------------------------- */ const int8_t tab_TR_Avail16[4 * 4] = { 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0 }; /* --------------------------------------------------------------------------- */ const int8_t tab_TR_Avail8[2 * 2] = { 1, 1, 1, 0 }; /* --------------------------------------------------------------------------- */ const int8_t *tab_DL_Avails[MAX_CU_SIZE_IN_BIT + 1] = { NULL, NULL, NULL, tab_DL_Avail8, tab_DL_Avail16, tab_DL_Avail32, tab_DL_Avail64 }; /* --------------------------------------------------------------------------- */ const int8_t *tab_TR_Avails[MAX_CU_SIZE_IN_BIT + 1] = { NULL, NULL, NULL, tab_TR_Avail8, tab_TR_Avail16, tab_TR_Avail32, tab_TR_Avail64 }; /* records the sample bit depth for intra predeiction */ /** * =========================================================================== * local function definition * =========================================================================== */ /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE int is_block_available(davs2_t *h, int x_4x4, int y_4x4, int dx_4x4, int dy_4x4, int cur_slice_idx) { int x2_4x4 = x_4x4 + dx_4x4; 
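/* Note on the index math below: the slice map is stored at SCU (8x8)
 * granularity, while the coordinates here are in 4x4 SPU units, so each
 * SCU spans a 2x2 group of SPUs and ">> 1" converts between the two.
 * For example, the SPU at (x, y) = (5, 3) belongs to the SCU at (2, 1),
 * i.e. entry scu_data[1 * i_width_in_scu + 2]. Two blocks count as
 * available only when their SCUs carry the same i_slice_nr. */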
int y2_4x4 = y_4x4 + dy_4x4; if (x2_4x4 < 0 || y2_4x4 < 0 || x2_4x4 >= h->i_width_in_spu || y2_4x4 >= h->i_height_in_spu) { return 0; } else { return cur_slice_idx == h->scu_data[(y2_4x4 >> 1) * h->i_width_in_scu + (x2_4x4 >> 1)].i_slice_nr; } } /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE uint32_t get_intra_neighbors(davs2_t *h, int x_4x4, int y_4x4, int bsx, int bsy, int cur_slice_idx) { /* 1. check whether the neighboring blocks are in the same slice */ int b_LEFT = is_block_available(h, x_4x4, y_4x4, -1, 0, cur_slice_idx); int b_TOP = is_block_available(h, x_4x4, y_4x4, 0, -1, cur_slice_idx); int b_TOP_LEFT = is_block_available(h, x_4x4, y_4x4, -1, -1, cur_slice_idx); int b_LEFT_DOWN = is_block_available(h, x_4x4, y_4x4, -1, (bsy >> 1) - 1, cur_slice_idx); // (bsy >> MIN_PU_SIZE_IN_BIT << 1) int b_TOP_RIGHT = is_block_available(h, x_4x4, y_4x4, (bsx >> 1) - 1, -1, cur_slice_idx); // (bsx >> MIN_PU_SIZE_IN_BIT << 1) int leftdown; int upright; int log2_lcu_size_in_spu = (h->i_lcu_level - B4X4_IN_BIT); int i_lcu_mask = (1 << log2_lcu_size_in_spu) - 1; /* 2. check whether the neighboring blocks have been reconstructed before the current block */ x_4x4 = x_4x4 & i_lcu_mask; y_4x4 = y_4x4 & i_lcu_mask; leftdown = h->p_tab_DL_avail[((y_4x4 + (bsy >> 2) - 1) << log2_lcu_size_in_spu) + (x_4x4)]; upright = h->p_tab_TR_avail[((y_4x4) << log2_lcu_size_in_spu) + (x_4x4 + (bsx >> 2) - 1)]; b_LEFT_DOWN = b_LEFT_DOWN && leftdown; b_TOP_RIGHT = b_TOP_RIGHT && upright; return (b_LEFT << MD_I_LEFT) | (b_TOP << MD_I_TOP) | (b_TOP_LEFT << MD_I_TOP_LEFT) | (b_TOP_RIGHT << MD_I_TOP_RIGHT) | (b_LEFT_DOWN << MD_I_LEFT_DOWN); } /* --------------------------------------------------------------------------- */ static void ALWAYS_INLINE mem_repeat_p(pel_t *dst, pel_t val, size_t num) { while (num--) { *dst++ = val; } } /* --------------------------------------------------------------------------- */ static void ALWAYS_INLINE memcpy_vh_pp_c(pel_t *dst, pel_t *src, int i_src, size_t num) { while (num--) { *dst++ = *src; src += i_src; } } /* --------------------------------------------------------------------------- */ static void intra_pred_ver_c(pel_t *src, pel_t *dst, int i_dst, int mode, int width, int height) { pel_t *p_src = src + 1; int y; UNUSED_PARAMETER(mode); for (y = height; y != 0; y--) { memcpy(dst, p_src, width * sizeof(pel_t)); dst += i_dst; } } /* --------------------------------------------------------------------------- */ static void intra_pred_hor_c(pel_t *src, pel_t *dst, int i_dst, int mode, int width, int height) { pel_t *p_src = src - 1; int x, y; UNUSED_PARAMETER(mode); for (y = 0; y < height; y++) { for (x = 0; x < width; x++) { dst[x] = p_src[-y]; } dst += i_dst; } } /* --------------------------------------------------------------------------- */ static void intra_pred_dc_c(pel_t *src, pel_t *dst, int i_dst, int mode, int width, int height) { int b_top = mode >> 8; int b_left = mode & 0xFF; pel_t *p_src = src - 1; int dc_value = 0; int x, y; /* get DC value */ if (b_left) { for (y = 0; y < height; y++) { dc_value += p_src[-y]; } p_src = src + 1; if (b_top) { for (x = 0; x < width; x++) { dc_value += p_src[x]; } dc_value += ((width + height) >> 1); dc_value = (dc_value * (512 / (width + height))) >> 9; } else { dc_value += height / 2; dc_value /= height; } } else { p_src = src + 1; if (b_top) { for (x = 0; x < width; x++) { dc_value += p_src[x]; } dc_value += width / 2; dc_value /= width; } else { dc_value = 1 << (g_bit_depth - 1); } } /* fill the block */ x = (1 << g_bit_depth) - 1; /* max pixel value */ dc_value = DAVS2_CLIP3(0, x, dc_value); for (y = 0;
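/* (Both-edges case above: the divide by (width + height) is replaced by
 * a multiply with 512 / (width + height) and a shift by 9. For an 8x8
 * block this is exact: (sum + 8) * 32 >> 9 == (sum + 8) / 16. For
 * rectangular blocks, e.g. 8x16, the truncated factor 512 / 24 = 21
 * makes it a close fixed-point approximation of the true average.) */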
y < height; y++) { for (x = 0; x < width; x++) { dst[x] = (pel_t)dc_value; } dst += i_dst; } } /* --------------------------------------------------------------------------- */ static void intra_pred_plane_c(pel_t *src, pel_t *dst, int i_dst, int mode, int width, int height) { int ib_mult[5] = { 13, 17, 5, 11, 23 }; int ib_shift[5] = { 7, 10, 11, 15, 19 }; int im_h = ib_mult [tab_log2size[width ] - 2]; int im_v = ib_mult [tab_log2size[height] - 2]; int is_h = ib_shift[tab_log2size[width ] - 2]; int is_v = ib_shift[tab_log2size[height] - 2]; int iW2 = width >> 1; int iH2 = height >> 1; int iH = 0; int iV = 0; int iA, iB, iC; int x, y; int iTmp, iTmp2; int max_val = (1 << g_bit_depth) - 1; pel_t *p_src; UNUSED_PARAMETER(mode); p_src = src + 1; p_src += (iW2 - 1); for (x = 1; x < iW2 + 1; x++) { iH += x * (p_src[x] - p_src[-x]); } p_src = src - 1; p_src -= (iH2 - 1); for (y = 1; y < iH2 + 1; y++) { iV += y * (p_src[-y] - p_src[y]); } p_src = src; iA = (p_src[-1 - (height - 1)] + p_src[1 + width - 1]) << 4; iB = ((iH << 5) * im_h + (1 << (is_h - 1))) >> is_h; iC = ((iV << 5) * im_v + (1 << (is_v - 1))) >> is_v; iTmp = iA - (iH2 - 1) * iC - (iW2 - 1) * iB + 16; for (y = 0; y < height; y++) { iTmp2 = iTmp; for (x = 0; x < width; x++) { dst[x] = (pel_t)DAVS2_CLIP3(0, max_val, iTmp2 >> 5); iTmp2 += iB; } dst += i_dst; iTmp += iC; } } /* --------------------------------------------------------------------------- */ static void intra_pred_bilinear_c(pel_t *src, pel_t *dst, int i_dst, int mode, int width, int height) { itr_t pTop[MAX_CU_SIZE]; itr_t pLeft[MAX_CU_SIZE]; itr_t pT[MAX_CU_SIZE]; itr_t pL[MAX_CU_SIZE]; itr_t wy[MAX_CU_SIZE]; int ishift_x = tab_log2size[width]; int ishift_y = tab_log2size[height]; int ishift = DAVS2_MIN(ishift_x, ishift_y); int ishift_xy = ishift_x + ishift_y + 1; int offset = 1 << (ishift_x + ishift_y); int a, b, c, w, wxy, t; int predx; int x, y; int max_value = (1 << g_bit_depth) - 1; UNUSED_PARAMETER(mode); for (x = 0; x < width; x++) { pTop[x] = src[1 + x]; } for (y = 0; y < height; y++) { pLeft[y] = src[-1 - y]; } a = pTop[width - 1]; b = pLeft[height - 1]; c = (width == height) ? 
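/* (c blends the two outer reference corners a = top-right and
 * b = bottom-left: a rounded mean when the block is square, otherwise a
 * fixed-point blend whose weights are proportional to the block's width
 * and height (a << ishift_x versus b << ishift_y), scaled by 13 and
 * rounded by 1 << (ishift + 5). The step w = (c << 1) - a - b below then
 * spreads the corner correction one row at a time via wy[].) */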
(a + b + 1) >> 1 : (((a << ishift_x) + (b << ishift_y)) * 13 + (1 << (ishift + 5))) >> (ishift + 6); w = (c << 1) - a - b; for (x = 0; x < width; x++) { pT[x] = (itr_t)(b - pTop[x]); pTop[x] <<= ishift_y; } t = 0; for (y = 0; y < height; y++) { pL[y] = (itr_t)(a - pLeft[y]); pLeft[y] <<= ishift_x; wy[y] = (itr_t)t; t += w; } for (y = 0; y < height; y++) { predx = pLeft[y]; wxy = -wy[y]; for (x = 0; x < width; x++) { predx += pL[y]; wxy += wy[y]; pTop[x] += pT[x]; dst[x] = (pel_t)DAVS2_CLIP3(0, max_value, (((predx << ishift_y) + (pTop[x] << ishift_x) + wxy + offset) >> ishift_xy)); } dst += i_dst; } } /* --------------------------------------------------------------------------- */ static int get_context_pixel(int mode, int uiXYflag, int iTempD, int *offset) { int imult = tab_auc_dir_dxdy[uiXYflag][mode][0]; int ishift = tab_auc_dir_dxdy[uiXYflag][mode][1]; int iTempDn = iTempD * imult >> ishift; *offset = ((iTempD * imult * 32) >> ishift) - iTempDn * 32; return iTempDn; } /* --------------------------------------------------------------------------- */ static void intra_pred_ang_x_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int iDx = tab_auc_dir_dx[dir_mode]; int iDy = tab_auc_dir_dy[dir_mode]; #if BUGFIX_PREDICTION_INTRA int iX; #else int top_width = bsx - iDx; int iW2 = (bsx << 1) - 1; int iX, idx; #endif int c1, c2, c3, c4; int i, j; pel_t *dst_base = dst + iDy * i_dst + iDx; for (j = 0; j < bsy; j++, iDy++) { iX = get_context_pixel(dir_mode, 0, j + 1, &c4); c1 = 32 - c4; c2 = 64 - c4; c3 = 32 + c4; #if BUGFIX_PREDICTION_INTRA i = 0; #else if (iDy >= 0 && top_width > 0) { memcpy(dst, dst_base, top_width * sizeof(pel_t)); i = top_width; iX += top_width; } else { i = 0; } #endif for (; i < bsx; i++) { #if BUGFIX_PREDICTION_INTRA dst[i] = (pel_t)((src[iX] * c1 + src[iX + 1] * c2 + src[iX + 2] * c3 + src[iX + 3] * c4 + 64) >> 7); #else idx = DAVS2_MIN(iW2, iX); dst[i] = (pel_t)((src[idx] * c1 + src[idx + 1] * c2 + src[idx + 2] * c3 + src[idx + 3] * c4 + 64) >> 7); #endif iX++; } dst_base += i_dst; dst += i_dst; } } /* --------------------------------------------------------------------------- */ static void intra_pred_ang_y_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int offsets[64]; int xsteps[64]; int iDx = tab_auc_dir_dx[dir_mode]; int iDy = tab_auc_dir_dy[dir_mode]; #if !BUGFIX_PREDICTION_INTRA int iHeight2 = 1 - (bsy << 1); int top_width = bsx - iDx; #endif int i, j; int offset; int iY; pel_t *dst_base = dst + iDy * i_dst + iDx; for (i = 0; i < bsx; i++) { xsteps[i] = get_context_pixel(dir_mode, 1, i + 1, &offsets[i]); } for (j = 0; j < bsy; j++) { for (i = 0; i < bsx; i++) { #if !BUGFIX_PREDICTION_INTRA if (j >= -iDy && i < top_width) { dst[i] = dst_base[i]; } else { #endif int idx; iY = j + xsteps[i]; #if BUGFIX_PREDICTION_INTRA idx = -iY; #else idx = DAVS2_MAX(iHeight2, -iY); #endif offset = offsets[i]; dst[i] = (pel_t)((src[idx] * (32 - offset) + src[idx - 1] * (64 - offset) + src[idx - 2] * (32 + offset) + src[idx - 3] * offset + 64) >> 7); #if !BUGFIX_PREDICTION_INTRA } #endif } dst_base += i_dst; dst += i_dst; } } /* --------------------------------------------------------------------------- */ static void intra_pred_ang_xy_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { ALIGN16(int xoffsets[64]); ALIGN16(int xsteps[64]); const int iDx = tab_auc_dir_dx[dir_mode]; const int iDy = tab_auc_dir_dy[dir_mode]; pel_t *dst_base = dst - iDy * i_dst - iDx; int i, j, iXx, iYy; int offsetx, offsety; for (i = 0; i < 
bsx; i++) { xsteps[i] = get_context_pixel(dir_mode, 1, i + 1, &xoffsets[i]); } for (j = 0; j < bsy; j++) { iXx = -get_context_pixel(dir_mode, 0, j + 1, &offsetx); for (i = 0; i < bsx; i++) { #if !BUGFIX_PREDICTION_INTRA if (j >= iDy && i >= iDx) { dst[i] = dst_base[i]; } else { #endif iYy = j - xsteps[i]; if (iYy <= -1) { dst[i] = (pel_t)((src[ iXx + 2] * (32 - offsetx) + src[ iXx + 1] * (64 - offsetx) + src[ iXx] * (32 + offsetx) + src[ iXx - 1] * offsetx + 64) >> 7); } else { offsety = xoffsets[i]; dst[i] = (pel_t)((src[-iYy - 2] * (32 - offsety) + src[-iYy - 1] * (64 - offsety) + src[-iYy] * (32 + offsety) + src[-iYy + 1] * offsety + 64) >> 7); } #if !BUGFIX_PREDICTION_INTRA } #endif iXx++; } dst_base += i_dst; dst += i_dst; } } /* --------------------------------------------------------------------------- */ static void intra_pred_ang_x_3_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { ALIGN16(pel_t first_line[(64 + 176) << 2]); int line_size = bsx + (bsy >> 2) * 11 - 1; #if !BUGFIX_PREDICTION_INTRA int real_size = STARAVS_MIN(line_size, bsx * 2); #endif int aligned_line_size = 64 + 176; int i_dst4 = i_dst << 2; int i; #if !BUGFIX_PREDICTION_INTRA pel_t pad1, pad2, pad3, pad4; #endif pel_t *pfirst[4]; pfirst[0] = first_line; pfirst[1] = pfirst[0] + aligned_line_size; pfirst[2] = pfirst[1] + aligned_line_size; pfirst[3] = pfirst[2] + aligned_line_size; #if BUGFIX_PREDICTION_INTRA for (i = 0; i < line_size; i++, src++) { #else for (i = 0; i < real_size; i++, src++) { #endif pfirst[0][i] = (pel_t)(( src[2] + 5 * src[3] + 7 * src[4] + 3 * src[5] + 8) >> 4); pfirst[1][i] = (pel_t)(( src[5] + 3 * src[6] + 3 * src[7] + src[8] + 4) >> 3); pfirst[2][i] = (pel_t)((3 * src[8] + 7 * src[9] + 5 * src[10] + src[11] + 8) >> 4); pfirst[3][i] = (pel_t)(( src[11] + 2 * src[12] + src[13] + 0 * src[14] + 2) >> 2); } #if !BUGFIX_PREDICTION_INTRA // padding if (real_size < line_size) { int iW2 = (bsx << 1) - 1; int j; src -= real_size; pad1 = (pel_t)(( src[iW2] + 5 * src[iW2 + 1] + 7 * src[iW2 + 2] + 3 * src[iW2 + 3] + 8) >> 4); pad2 = (pel_t)(( src[iW2] + 3 * src[iW2 + 1] + 3 * src[iW2 + 2] + src[iW2 + 3] + 4) >> 3); pad3 = (pel_t)((3 * src[iW2] + 7 * src[iW2 + 1] + 5 * src[iW2 + 2] + src[iW2 + 3] + 8) >> 4); pad4 = (pel_t)(( src[iW2] + 2 * src[iW2 + 1] + src[iW2 + 2] + 0 * src[iW2 + 3] + 2) >> 2); for (j = real_size - 1; j > iW2 - 2; j--) { pfirst[3][j] = pad4; pfirst[2][j] = pad3; pfirst[1][j] = pad2; pfirst[0][j] = pad1; } for (; j > iW2 - 5; j--) { pfirst[3][j] = pad4; pfirst[2][j] = pad3; pfirst[1][j] = pad2; } for (; j > iW2 - 8; j--) { pfirst[3][j] = pad4; pfirst[2][j] = pad3; } for (; j > iW2 - 11; j--) { pfirst[3][j] = pad4; } for (; i < line_size; i++) { pfirst[0][i] = pad1; pfirst[1][i] = pad2; pfirst[2][i] = pad3; pfirst[3][i] = pad4; } } #endif bsy >>= 2; for (i = 0; i < bsy; i++) { memcpy(dst , pfirst[0] + i * 11, bsx * sizeof(pel_t)); memcpy(dst + i_dst, pfirst[1] + i * 11, bsx * sizeof(pel_t)); memcpy(dst + 2 * i_dst, pfirst[2] + i * 11, bsx * sizeof(pel_t)); memcpy(dst + 3 * i_dst, pfirst[3] + i * 11, bsx * sizeof(pel_t)); dst += i_dst4; } } /* --------------------------------------------------------------------------- */ static void intra_pred_ang_x_4_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { ALIGN16(pel_t first_line[64 + 128]); int line_size = bsx + ((bsy - 1) << 1); #if !BUGFIX_PREDICTION_INTRA int real_size = STARAVS_MIN(line_size, (bsx << 1) - 2); #endif int iHeight2 = bsy << 1; int i; src += 3; #if BUGFIX_PREDICTION_INTRA for (i = 
0; i < line_size; i++, src++) { #else for (i = 0; i < real_size; i++, src++) { #endif first_line[i] = (pel_t)((src[-1] + src[0] * 2 + src[1] + 2) >> 2); } #if !BUGFIX_PREDICTION_INTRA // padding for (; i < line_size; i++) { first_line[i] = first_line[real_size - 1]; } #endif for (i = 0; i < iHeight2; i += 2) { memcpy(dst, first_line + i, bsx * sizeof(pel_t)); dst += i_dst; } } /* --------------------------------------------------------------------------- */ static void intra_pred_ang_x_5_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i; if (((bsy > 4) && (bsx > 8))) { ALIGN16(pel_t first_line[(64 + 80) << 3]); #if !BUGFIX_PREDICTION_INTRA int iW2 = bsx * 2 - 1; #endif int line_size = bsx + (((bsy - 8) * 11) >> 3); #if !BUGFIX_PREDICTION_INTRA int real_size = STARAVS_MIN(line_size, iW2 + 1); #endif int aligned_line_size = ((line_size + 15) >> 4) << 4; pel_t *pfirst[8]; #if !BUGFIX_PREDICTION_INTRA pel_t *src_org = src; #endif pel_t *dst1 = dst; pel_t *dst2 = dst1 + i_dst; pel_t *dst3 = dst2 + i_dst; pel_t *dst4 = dst3 + i_dst; pel_t *dst5 = dst4 + i_dst; pel_t *dst6 = dst5 + i_dst; pel_t *dst7 = dst6 + i_dst; pel_t *dst8 = dst7 + i_dst; pfirst[0] = first_line; pfirst[1] = pfirst[0] + aligned_line_size; pfirst[2] = pfirst[1] + aligned_line_size; pfirst[3] = pfirst[2] + aligned_line_size; pfirst[4] = pfirst[3] + aligned_line_size; pfirst[5] = pfirst[4] + aligned_line_size; pfirst[6] = pfirst[5] + aligned_line_size; pfirst[7] = pfirst[6] + aligned_line_size; #if BUGFIX_PREDICTION_INTRA for (i = 0; i < line_size; src++, i++) { #else for (i = 0; i < real_size; src++, i++) { #endif pfirst[0][i] = (pel_t)((5 * src[1] + 13 * src[2] + 11 * src[3] + 3 * src[4] + 16) >> 5); pfirst[1][i] = (pel_t)(( src[2] + 5 * src[3] + 7 * src[4] + 3 * src[5] + 8) >> 4); pfirst[2][i] = (pel_t)((7 * src[4] + 15 * src[5] + 9 * src[6] + src[7] + 16) >> 5); pfirst[3][i] = (pel_t)(( src[5] + 3 * src[6] + 3 * src[7] + src[8] + 4) >> 3); pfirst[4][i] = (pel_t)(( src[6] + 9 * src[7] + 15 * src[8] + 7 * src[9] + 16) >> 5); pfirst[5][i] = (pel_t)(( 3 * src[8] + 7 * src[9] + 5 * src[10] + src[11] + 8) >> 4); pfirst[6][i] = (pel_t)(( 3 * src[9] + 11 * src[10] + 13 * src[11] + 5 * src[12] + 16) >> 5); pfirst[7][i] = (pel_t)(( src[11] + 2 * src[12] + src[13] + 2) >> 2); } #if !BUGFIX_PREDICTION_INTRA //padding if (((real_size - 1) + 11) > iW2) { src = src_org + iW2; pel_t pad1 = pfirst[0][iW2 - 1]; pel_t pad2 = pfirst[1][iW2 - 2]; pel_t pad3 = pfirst[2][iW2 - 4]; pel_t pad4 = pfirst[3][iW2 - 5]; pel_t pad5 = pfirst[4][iW2 - 6]; pel_t pad6 = pfirst[5][iW2 - 8]; pel_t pad7 = pfirst[6][iW2 - 9]; pel_t pad8 = pfirst[7][iW2 - 11]; int start1 = iW2; int start2 = iW2 - 1; int start3 = iW2 - 3; int start4 = iW2 - 4; int start5 = iW2 - 5; int start6 = iW2 - 7; int start7 = iW2 - 8; int start8 = iW2 - 10; for (i = start1; i < line_size; i++) { pfirst[0][i] = pad1; } for (i = start2; i < line_size; i++) { pfirst[1][i] = pad2; } for (i = start3; i < line_size; i++) { pfirst[2][i] = pad3; } for (i = start4; i < line_size; i++) { pfirst[3][i] = pad4; } for (i = start5; i < line_size; i++) { pfirst[4][i] = pad5; } for (i = start6; i < line_size; i++) { pfirst[5][i] = pad6; } for (i = start7; i < line_size; i++) { pfirst[6][i] = pad7; } for (i = start8; i < line_size; i++) { pfirst[7][i] = pad8; } } #endif bsy >>= 3; for (i = 0; i < bsy; i++) { memcpy(dst1, pfirst[0] + i * 11, bsx * sizeof(pel_t)); memcpy(dst2, pfirst[1] + i * 11, bsx * sizeof(pel_t)); memcpy(dst3, pfirst[2] + i * 11, bsx * sizeof(pel_t)); memcpy(dst4, 
pfirst[3] + i * 11, bsx * sizeof(pel_t)); memcpy(dst5, pfirst[4] + i * 11, bsx * sizeof(pel_t)); memcpy(dst6, pfirst[5] + i * 11, bsx * sizeof(pel_t)); memcpy(dst7, pfirst[6] + i * 11, bsx * sizeof(pel_t)); memcpy(dst8, pfirst[7] + i * 11, bsx * sizeof(pel_t)); dst1 = dst8 + i_dst; dst2 = dst1 + i_dst; dst3 = dst2 + i_dst; dst4 = dst3 + i_dst; dst5 = dst4 + i_dst; dst6 = dst5 + i_dst; dst7 = dst6 + i_dst; dst8 = dst7 + i_dst; } } else if (bsx == 16) { pel_t *dst1 = dst; pel_t *dst2 = dst1 + i_dst; pel_t *dst3 = dst2 + i_dst; pel_t *dst4 = dst3 + i_dst; for (i = 0; i < bsx; i++, src++) { dst1[i] = (pel_t)((5 * src[1] + 13 * src[2] + 11 * src[3] + 3 * src[4] + 16) >> 5); dst2[i] = (pel_t)(( src[2] + 5 * src[3] + 7 * src[4] + 3 * src[5] + 8) >> 4); dst3[i] = (pel_t)((7 * src[4] + 15 * src[5] + 9 * src[6] + src[7] + 16) >> 5); dst4[i] = (pel_t)(( src[5] + 3 * src[6] + 3 * src[7] + src[8] + 4) >> 3); } } else if (bsx == 8) { pel_t *dst1 = dst; pel_t *dst2 = dst1 + i_dst; pel_t *dst3 = dst2 + i_dst; pel_t *dst4 = dst3 + i_dst; pel_t *dst5 = dst4 + i_dst; pel_t *dst6 = dst5 + i_dst; pel_t *dst7 = dst6 + i_dst; pel_t *dst8 = dst7 + i_dst; for (i = 0; i < 8; src++, i++) { dst1[i] = (pel_t)((5 * src[1] + 13 * src[2] + 11 * src[3] + 3 * src[4] + 16) >> 5); dst2[i] = (pel_t)(( src[2] + 5 * src[3] + 7 * src[4] + 3 * src[5] + 8) >> 4); dst3[i] = (pel_t)((7 * src[4] + 15 * src[5] + 9 * src[6] + src[7] + 16) >> 5); dst4[i] = (pel_t)(( src[5] + 3 * src[6] + 3 * src[7] + src[8] + 4) >> 3); dst5[i] = (pel_t)(( src[6] + 9 * src[7] + 15 * src[8] + 7 * src[9] + 16) >> 5); dst6[i] = (pel_t)(( 3 * src[8] + 7 * src[9] + 5 * src[10] + src[11] + 8) >> 4); dst7[i] = (pel_t)(( 3 * src[9] + 11 * src[10] + 13 * src[11] + 5 * src[12] + 16) >> 5); dst8[i] = (pel_t)(( src[11] + 2 * src[12] + src[13] + 2) >> 2); } #if !BUGFIX_PREDICTION_INTRA dst7[7] = dst7[6]; dst8[7] = dst8[4]; dst8[6] = dst8[4]; dst8[5] = dst8[4]; #endif if (bsy == 32) { //src -> 8,src[8] -> 16 #if BUGFIX_PREDICTION_INTRA pel_t pad1 = src[8]; dst1 = dst8 + i_dst; int j; for (j = 0; j < 24; j++) { for (i = 0; i < 8; i++) { dst1[i] = pad1; } dst1 += i_dst; } dst1 = dst8 + i_dst; dst2 = dst1 + i_dst; dst3 = dst2 + i_dst; src += 4; dst1[0] = (pel_t)((5 * src[0] + 13 * src[1] + 11 * src[2] + 3 * src[3] + 16) >> 5); dst1[1] = (pel_t)((5 * src[1] + 13 * src[2] + 11 * src[3] + 3 * src[4] + 16) >> 5); dst1[2] = (pel_t)((5 * src[2] + 13 * src[3] + 11 * src[4] + 3 * src[5] + 16) >> 5); dst1[3] = (pel_t)((5 * src[3] + 13 * src[4] + 11 * src[5] + 3 * src[6] + 16) >> 5); dst2[0] = (pel_t)((src[1] + 5 * src[2] + 7 * src[3] + 3 * src[4] + 8) >> 4); dst2[1] = (pel_t)((src[2] + 5 * src[3] + 7 * src[4] + 3 * src[5] + 8) >> 4); dst2[2] = (pel_t)((src[3] + 5 * src[4] + 7 * src[5] + 3 * src[6] + 8) >> 4); dst3[0] = (pel_t)((7 * src[3] + 15 * src[4] + 9 * src[5] + src[6] + 16) >> 5); #else //src -> 8,src[7] -> 15 pel_t pad1 = (pel_t)((5 * src[7] + 13 * src[8] + 11 * src[9] + 3 * src[10] + 16) >> 5); pel_t pad2 = (pel_t)((src[7] + 5 * src[8] + 7 * src[9] + 3 * src[10] + 8) >> 4); pel_t pad3 = (pel_t)((7 * src[7] + 15 * src[8] + 9 * src[9] + src[10] + 16) >> 5); pel_t pad4 = (pel_t)((src[7] + 3 * src[8] + 3 * src[9] + src[10] + 4) >> 3); pel_t pad5 = (pel_t)((src[7] + 9 * src[8] + 15 * src[9] + 7 * src[10] + 16) >> 5); pel_t pad6 = dst6[7]; pel_t pad7 = dst7[7]; pel_t pad8 = dst8[7]; dst1 = dst8 + i_dst; dst2 = dst1 + i_dst; dst3 = dst2 + i_dst; dst4 = dst3 + i_dst; dst5 = dst4 + i_dst; dst6 = dst5 + i_dst; dst7 = dst6 + i_dst; dst8 = dst7 + i_dst; for (i = 0; i < 8; i++) { 
dst1[i] = pad1; dst2[i] = pad2; dst3[i] = pad3; dst4[i] = pad4; dst5[i] = pad5; dst6[i] = pad6; dst7[i] = pad7; dst8[i] = pad8; } src += 4; dst1[0] = (pel_t)((5 * src[0] + 13 * src[1] + 11 * src[2] + 3 * src[3] + 16) >> 5); dst1[1] = (pel_t)((5 * src[1] + 13 * src[2] + 11 * src[3] + 3 * src[4] + 16) >> 5); dst1[2] = (pel_t)((5 * src[2] + 13 * src[3] + 11 * src[4] + 3 * src[5] + 16) >> 5); dst2[0] = (pel_t)(( src[1] + 5 * src[2] + 7 * src[3] + 3 * src[4] + 8) >> 4); dst2[1] = (pel_t)(( src[2] + 5 * src[3] + 7 * src[4] + 3 * src[5] + 8) >> 4); dst1 = dst8 + i_dst; dst2 = dst1 + i_dst; dst3 = dst2 + i_dst; dst4 = dst3 + i_dst; dst5 = dst4 + i_dst; dst6 = dst5 + i_dst; dst7 = dst6 + i_dst; dst8 = dst7 + i_dst; for (i = 0; i < 8; i++) { dst1[i] = pad1; dst2[i] = pad2; dst3[i] = pad3; dst4[i] = pad4; dst5[i] = pad5; dst6[i] = pad6; dst7[i] = pad7; dst8[i] = pad8; } dst1 = dst8 + i_dst; dst2 = dst1 + i_dst; dst3 = dst2 + i_dst; dst4 = dst3 + i_dst; dst5 = dst4 + i_dst; dst6 = dst5 + i_dst; dst7 = dst6 + i_dst; dst8 = dst7 + i_dst; for (i = 0; i < 8; i++) { dst1[i] = pad1; dst2[i] = pad2; dst3[i] = pad3; dst4[i] = pad4; dst5[i] = pad5; dst6[i] = pad6; dst7[i] = pad7; dst8[i] = pad8; } #endif } } else { pel_t *dst1 = dst; pel_t *dst2 = dst1 + i_dst; pel_t *dst3 = dst2 + i_dst; pel_t *dst4 = dst3 + i_dst; for (i = 0; i < 4; i++, src++) { dst1[i] = (pel_t)((5 * src[1] + 13 * src[2] + 11 * src[3] + 3 * src[4] + 16) >> 5); dst2[i] = (pel_t)(( src[2] + 5 * src[3] + 7 * src[4] + 3 * src[5] + 8) >> 4); dst3[i] = (pel_t)((7 * src[4] + 15 * src[5] + 9 * src[6] + src[7] + 16) >> 5); dst4[i] = (pel_t)(( src[5] + 3 * src[6] + 3 * src[7] + src[8] + 4) >> 3); } #if !BUGFIX_PREDICTION_INTRA dst4[3] = dst4[2]; #endif if (bsy == 16) { #if BUGFIX_PREDICTION_INTRA pel_t *dst5 = dst4 + i_dst; src += 4; pel_t pad1 = src[0]; int j; for (j = 0; j < 12; j++) { for (i = 0; i < 4; i++) { dst5[i] = pad1; } dst5 += i_dst; } dst5 = dst4 + i_dst; dst5[0] = (pel_t)((src[-2] + 9 * src[-1] + 15 * src[0] + 7 * src[1] + 16) >> 5); dst5[1] = (pel_t)((src[-1] + 9 * src[ 0] + 15 * src[1] + 7 * src[2] + 16) >> 5); #else pel_t *dst5 = dst4 + i_dst; pel_t *dst6 = dst5 + i_dst; pel_t *dst7 = dst6 + i_dst; pel_t *dst8 = dst7 + i_dst; src += 3; pel_t pad1 = (pel_t)((5 * src[0] + 13 * src[1] + 11 * src[2] + 3 * src[3] + 16) >> 5); pel_t pad2 = (pel_t)(( src[0] + 5 * src[1] + 7 * src[2] + 3 * src[3] + 8) >> 4); pel_t pad3 = (pel_t)((7 * src[0] + 15 * src[1] + 9 * src[2] + 1 * src[3] + 16) >> 5); pel_t pad4 = dst4[3]; pel_t pad5 = (pel_t)(( src[0] + 9 * src[1] + 15 * src[2] + 7 * src[3] + 16) >> 5); pel_t pad6 = (pel_t)((3 * src[0] + 7 * src[1] + 5 * src[2] + src[3] + 8) >> 4); pel_t pad7 = (pel_t)((3 * src[0] + 11 * src[1] + 13 * src[2] + 5 * src[3] + 16) >> 5); pel_t pad8 = (pel_t)(( src[0] + 2 * src[1] + src[2] + 2) >> 2); for (i = 0; i < 4; i++) { dst5[i] = pad5; dst6[i] = pad6; dst7[i] = pad7; dst8[i] = pad8; } dst5[0] = (pel_t)((src[-1] + 9 * src[0] + 15 * src[1] + 7 * src[2] + 16) >> 5); dst1 = dst8 + i_dst; dst2 = dst1 + i_dst; dst3 = dst2 + i_dst; dst4 = dst3 + i_dst; dst5 = dst4 + i_dst; dst6 = dst5 + i_dst; dst7 = dst6 + i_dst; dst8 = dst7 + i_dst; for (i = 0; i < 4; i++) { dst1[i] = pad1; dst2[i] = pad2; dst3[i] = pad3; dst4[i] = pad4; dst5[i] = pad5; dst6[i] = pad6; dst7[i] = pad7; dst8[i] = pad8; } #endif } } } /* --------------------------------------------------------------------------- */ static void intra_pred_ang_x_6_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { ALIGN16(pel_t first_line[64 + 64]); int 
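/* (Mode 6 is the pure 45-degree diagonal: every output row is the same
 * (1,2,1)/4 filtered reference row advanced by one sample, so it is
 * computed once into first_line[] of length bsx + bsy - 1 and each of
 * the bsy rows below becomes a single memcpy at offset i.) */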
line_size = bsx + bsy - 1; #if !BUGFIX_PREDICTION_INTRA int real_size = STARAVS_MIN(line_size, (bsx << 1) - 1); #endif int i; #if !BUGFIX_PREDICTION_INTRA pel_t pad; #endif #if BUGFIX_PREDICTION_INTRA for (i = 0; i < line_size; i++, src++) { #else for (i = 0; i < real_size; i++, src++) { #endif first_line[i] = (pel_t)((src[1] + (src[2] << 1) + src[3] + 2) >> 2); } #if !BUGFIX_PREDICTION_INTRA // padding pad = first_line[real_size - 1]; for (; i < line_size; i++) { first_line[i] = pad; } #endif for (i = 0; i < bsy; i++) { memcpy(dst, first_line + i, bsx * sizeof(pel_t)); dst += i_dst; } } /* --------------------------------------------------------------------------- */ static void intra_pred_ang_x_7_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i; pel_t *dst1 = dst; pel_t *dst2 = dst1 + i_dst; pel_t *dst3 = dst2 + i_dst; pel_t *dst4 = dst3 + i_dst; if (bsy == 4) { for (i = 0; i < bsx; src++, i++){ dst1[i] = (pel_t)((src[0] * 9 + src[1] * 41 + src[2] * 55 + src[3] * 23 + 64) >> 7); dst2[i] = (pel_t)((src[1] * 9 + src[2] * 25 + src[3] * 23 + src[4] * 7 + 32) >> 6); dst3[i] = (pel_t)((src[2] * 27 + src[3] * 59 + src[4] * 37 + src[5] * 5 + 64) >> 7); dst4[i] = (pel_t)((src[2] * 3 + src[3] * 35 + src[4] * 61 + src[5] * 29 + 64) >> 7); } } else if (bsy == 8) { pel_t *dst5 = dst4 + i_dst; pel_t *dst6 = dst5 + i_dst; pel_t *dst7 = dst6 + i_dst; pel_t *dst8 = dst7 + i_dst; for (i = 0; i < bsx; src++, i++){ dst1[i] = (pel_t)((src[0] * 9 + src[1] * 41 + src[2] * 55 + src[3] * 23 + 64) >> 7); dst2[i] = (pel_t)((src[1] * 9 + src[2] * 25 + src[3] * 23 + src[4] * 7 + 32) >> 6); dst3[i] = (pel_t)((src[2] * 27 + src[3] * 59 + src[4] * 37 + src[5] * 5 + 64) >> 7); dst4[i] = (pel_t)((src[2] * 3 + src[3] * 35 + src[4] * 61 + src[5] * 29 + 64) >> 7); dst5[i] = (pel_t)((src[3] * 3 + src[4] * 11 + src[5] * 13 + src[6] * 5 + 16) >> 5); dst6[i] = (pel_t)((src[4] * 21 + src[5] * 53 + src[6] * 43 + src[7] * 11 + 64) >> 7); dst7[i] = (pel_t)((src[5] * 15 + src[6] * 31 + src[7] * 17 + src[8] + 32) >> 6); dst8[i] = (pel_t)((src[5] * 3 + src[6] * 19 + src[7] * 29 + src[8] * 13 + 32) >> 6); } } else { intra_pred_ang_x_c(src, dst, i_dst, dir_mode, bsx, bsy); } } /* --------------------------------------------------------------------------- */ static void intra_pred_ang_x_8_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { ALIGN16(pel_t first_line[2 * (64 + 32)]); int line_size = bsx + (bsy >> 1) - 1; #if !BUGFIX_PREDICTION_INTRA int real_size = STARAVS_MIN(line_size, bsx * 2); #endif int aligned_line_size = ((line_size + 15) >> 4) << 4; int i_dst2 = i_dst << 1; int i; #if !BUGFIX_PREDICTION_INTRA pel_t pad1, pad2; #endif pel_t *pfirst[2]; pfirst[0] = first_line; pfirst[1] = first_line + aligned_line_size; #if BUGFIX_PREDICTION_INTRA for (i = 0; i < line_size; i++, src++) { #else for (i = 0; i < real_size; i++, src++) { #endif pfirst[0][i] = (pel_t)((src[0] + (src[1] + src[2]) * 3 + src[3] + 4) >> 3); pfirst[1][i] = (pel_t)((src[1] + (src[2] << 1) + src[3] + 2) >> 2); } #if !BUGFIX_PREDICTION_INTRA // padding if (real_size < line_size) { pfirst[1][real_size - 1] = pfirst[1][real_size - 2]; pad1 = pfirst[0][real_size - 1]; pad2 = pfirst[1][real_size - 1]; for (; i < line_size; i++) { pfirst[0][i] = pad1; pfirst[1][i] = pad2; } } #endif bsy >>= 1; for (i = 0; i < bsy; i++) { memcpy(dst , pfirst[0] + i, bsx * sizeof(pel_t)); memcpy(dst + i_dst, pfirst[1] + i, bsx * sizeof(pel_t)); dst += i_dst2; } } /* --------------------------------------------------------------------------- 
*/ static void intra_pred_ang_x_9_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { if (bsy > 8){ intra_pred_ang_x_c(src, dst, i_dst, dir_mode, bsx, bsy); /* ALIGN16(pel_t first_line[(64 + 32) * 11]); int line_size = bsx + (bsy * 93 >> 8) - 1; int real_size = STARAVS_MIN(line_size, bsx * 2); int aligned_line_size = ((line_size + 31) >> 5) << 5; int i_dst11 = i_dst * 11; int i; pel_t pad1, pad2, pad3, pad4, pad5, pad6, pad7, pad8, pad9, pad10, pad11; pel_t *pfirst[11]; pfirst[0] = first_line; pfirst[1] = pfirst[0] + aligned_line_size; pfirst[2] = pfirst[1] + aligned_line_size; pfirst[3] = pfirst[2] + aligned_line_size; pfirst[4] = pfirst[3] + aligned_line_size; pfirst[5] = pfirst[4] + aligned_line_size; pfirst[6] = pfirst[5] + aligned_line_size; pfirst[7] = pfirst[6] + aligned_line_size; pfirst[8] = pfirst[7] + aligned_line_size; pfirst[9] = pfirst[8] + aligned_line_size; pfirst[10] = pfirst[9] + aligned_line_size; for (i = 0; i < real_size; i++, src++) { pfirst[0][i] = (pel_t)((21 * src[0] + 53 * src[1] + 43 * src[2] + 11 * src[3] + 64) >> 7); pfirst[1][i] = (pel_t)((9 * src[0] + 41 * src[1] + 55 * src[2] + 23 * src[3] + 64) >> 7); pfirst[2][i] = (pel_t)((15 * src[1] + 31 * src[2] + 17 * src[3] + 1 * src[4] + 32) >> 6); pfirst[3][i] = (pel_t)((9 * src[1] + 25 * src[2] + 23 * src[3] + 7 * src[4] + 32) >> 6); pfirst[4][i] = (pel_t)((3 * src[1] + 19 * src[2] + 29 * src[3] + 13 * src[4] + 32) >> 6); pfirst[5][i] = (pel_t)((27 * src[2] + 59 * src[3] + 37 * src[4] + 5 * src[5] + 64) >> 7); pfirst[6][i] = (pel_t)((15 * src[2] + 47 * src[3] + 49 * src[4] + 17 * src[5] + 64) >> 7); pfirst[7][i] = (pel_t)((3 * src[2] + 35 * src[3] + 61 * src[4] + 29 * src[5] + 64) >> 7); pfirst[8][i] = (pel_t)((3 * src[3] + 7 * src[4] + 5 * src[5] + 1 * src[6] + 8) >> 4); pfirst[9][i] = (pel_t)((3 * src[3] + 11 * src[4] + 13 * src[5] + 5 * src[6] + 16) >> 5); pfirst[10][i] = (pel_t)((1 * src[3] + 33 * src[4] + 63 * src[5] + 31 * src[6] + 64) >> 7); } // padding if (real_size < line_size) { pfirst[8][real_size - 3] = pfirst[8][real_size - 4]; pfirst[9][real_size - 3] = pfirst[9][real_size - 4]; pfirst[10][real_size - 3] = pfirst[10][real_size - 4]; pfirst[8][real_size - 2] = pfirst[8][real_size - 3]; pfirst[9][real_size - 2] = pfirst[9][real_size - 3]; pfirst[10][real_size - 2] = pfirst[10][real_size - 3]; pfirst[8][real_size - 1] = pfirst[8][real_size - 2]; pfirst[9][real_size - 1] = pfirst[9][real_size - 2]; pfirst[10][real_size - 1] = pfirst[10][real_size - 2]; pfirst[5][real_size - 2] = pfirst[5][real_size - 3]; pfirst[6][real_size - 2] = pfirst[6][real_size - 3]; pfirst[7][real_size - 2] = pfirst[7][real_size - 3]; pfirst[5][real_size - 1] = pfirst[5][real_size - 2]; pfirst[6][real_size - 1] = pfirst[6][real_size - 2]; pfirst[7][real_size - 1] = pfirst[7][real_size - 2]; pfirst[2][real_size - 1] = pfirst[2][real_size - 2]; pfirst[3][real_size - 1] = pfirst[3][real_size - 2]; pfirst[4][real_size - 1] = pfirst[4][real_size - 2]; pad1 = pfirst[0][real_size - 1]; pad2 = pfirst[1][real_size - 1]; pad3 = pfirst[2][real_size - 1]; pad4 = pfirst[3][real_size - 1]; pad5 = pfirst[4][real_size - 1]; pad6 = pfirst[5][real_size - 1]; pad7 = pfirst[6][real_size - 1]; pad8 = pfirst[7][real_size - 1]; pad9 = pfirst[8][real_size - 1]; pad10 = pfirst[9][real_size - 1]; pad11 = pfirst[10][real_size - 1]; for (; i < line_size; i++) { pfirst[0][i] = pad1; pfirst[1][i] = pad2; pfirst[2][i] = pad3; pfirst[3][i] = pad4; pfirst[4][i] = pad5; pfirst[5][i] = pad6; pfirst[6][i] = pad7; pfirst[7][i] = pad8; pfirst[8][i] 
= pad9; pfirst[9][i] = pad10; pfirst[10][i] = pad11; } } int bsy_b = bsy / 11; for (i = 0; i < bsy_b; i++) { memcpy(dst, pfirst[0] + i, bsx * sizeof(pel_t)); memcpy(dst + i_dst, pfirst[1] + i, bsx * sizeof(pel_t)); memcpy(dst + 2 * i_dst, pfirst[2] + i, bsx * sizeof(pel_t)); memcpy(dst + 3 * i_dst, pfirst[3] + i, bsx * sizeof(pel_t)); memcpy(dst + 4 * i_dst, pfirst[4] + i, bsx * sizeof(pel_t)); memcpy(dst + 5 * i_dst, pfirst[5] + i, bsx * sizeof(pel_t)); memcpy(dst + 6 * i_dst, pfirst[6] + i, bsx * sizeof(pel_t)); memcpy(dst + 7 * i_dst, pfirst[7] + i, bsx * sizeof(pel_t)); memcpy(dst + 8 * i_dst, pfirst[8] + i, bsx * sizeof(pel_t)); memcpy(dst + 9 * i_dst, pfirst[9] + i, bsx * sizeof(pel_t)); memcpy(dst + 10 * i_dst, pfirst[10] + i, bsx * sizeof(pel_t)); dst += i_dst11; } int bsy_r = bsy - bsy_b * 11; for (i = 0; i < bsy_r; i++) { memcpy(dst, pfirst[i] + bsy_b, bsx * sizeof(pel_t)); dst += i_dst; } */ } else if (bsy == 8) { pel_t *dst1 = dst; pel_t *dst2 = dst1 + i_dst; pel_t *dst3 = dst2 + i_dst; pel_t *dst4 = dst3 + i_dst; pel_t *dst5 = dst4 + i_dst; pel_t *dst6 = dst5 + i_dst; pel_t *dst7 = dst6 + i_dst; pel_t *dst8 = dst7 + i_dst; for (int i = 0; i < bsx; i++, src++) { dst1[i] = (pel_t)((21 * src[0] + 53 * src[1] + 43 * src[2] + 11 * src[3] + 64) >> 7); dst2[i] = (pel_t)((9 * src[0] + 41 * src[1] + 55 * src[2] + 23 * src[3] + 64) >> 7); dst3[i] = (pel_t)((15 * src[1] + 31 * src[2] + 17 * src[3] + src[4] + 32) >> 6); dst4[i] = (pel_t)((9 * src[1] + 25 * src[2] + 23 * src[3] + 7 * src[4] + 32) >> 6); dst5[i] = (pel_t)((3 * src[1] + 19 * src[2] + 29 * src[3] + 13 * src[4] + 32) >> 6); dst6[i] = (pel_t)((27 * src[2] + 59 * src[3] + 37 * src[4] + 5 * src[5] + 64) >> 7); dst7[i] = (pel_t)((15 * src[2] + 47 * src[3] + 49 * src[4] + 17 * src[5] + 64) >> 7); dst8[i] = (pel_t)((3 * src[2] + 35 * src[3] + 61 * src[4] + 29 * src[5] + 64) >> 7); } } else /*if (bsy == 4)*/ { pel_t *dst1 = dst; pel_t *dst2 = dst1 + i_dst; pel_t *dst3 = dst2 + i_dst; pel_t *dst4 = dst3 + i_dst; for (int i = 0; i < bsx; i++, src++) { dst1[i] = (pel_t)((21 * src[0] + 53 * src[1] + 43 * src[2] + 11 * src[3] + 64) >> 7); dst2[i] = (pel_t)((9 * src[0] + 41 * src[1] + 55 * src[2] + 23 * src[3] + 64) >> 7); dst3[i] = (pel_t)((15 * src[1] + 31 * src[2] + 17 * src[3] + src[4] + 32) >> 6); dst4[i] = (pel_t)((9 * src[1] + 25 * src[2] + 23 * src[3] + 7 * src[4] + 32) >> 6); } } } /* --------------------------------------------------------------------------- */ static void intra_pred_ang_x_10_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { pel_t *dst1 = dst; pel_t *dst2 = dst1 + i_dst; pel_t *dst3 = dst2 + i_dst; pel_t *dst4 = dst3 + i_dst; int i; if (bsy != 4) { ALIGN16(pel_t first_line[4 * (64 + 16)]); int line_size = bsx + bsy / 4 - 1; int aligned_line_size = ((line_size + 15) >> 4) << 4; pel_t *pfirst[4]; pfirst[0] = first_line; pfirst[1] = first_line + aligned_line_size; pfirst[2] = first_line + aligned_line_size * 2; pfirst[3] = first_line + aligned_line_size * 3; for (i = 0; i < line_size; i++, src++) { pfirst[0][i] = (pel_t)((src[0] * 3 + src[1] * 7 + src[2] * 5 + src[3] + 8) >> 4); pfirst[1][i] = (pel_t)((src[0] + (src[1] + src[2]) * 3 + src[3] + 4) >> 3); pfirst[2][i] = (pel_t)((src[0] + src[1] * 5 + src[2] * 7 + src[3] * 3 + 8) >> 4); pfirst[3][i] = (pel_t)((src[1] + src[2] * 2 + src[3] + 2) >> 2); } bsy >>= 2; i_dst <<= 2; for (i = 0; i < bsy; i++) { memcpy(dst1, pfirst[0] + i, bsx * sizeof(pel_t)); memcpy(dst2, pfirst[1] + i, bsx * sizeof(pel_t)); memcpy(dst3, pfirst[2] + i, bsx * 
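/* (Mode 10 advances one reference sample every four rows -- a slope of
 * 1/4 -- so the four filter phases are precomputed above into
 * pfirst[0..3], and row 4 * i + k of the block is simply pfirst[k]
 * shifted by i samples.) */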
sizeof(pel_t)); memcpy(dst4, pfirst[3] + i, bsx * sizeof(pel_t)); dst1 += i_dst; dst2 += i_dst; dst3 += i_dst; dst4 += i_dst; } } else { for (i = 0; i < bsx; i++, src++) { dst1[i] = (pel_t)((src[0] * 3 + src[1] * 7 + src[2] * 5 + src[3] + 8) >> 4); dst2[i] = (pel_t)((src[0] + (src[1] + src[2]) * 3 + src[3] + 4) >> 3); dst3[i] = (pel_t)((src[0] + src[1] * 5 + src[2] * 7 + src[3] * 3 + 8) >> 4); dst4[i] = (pel_t)((src[1] + src[2] * 2 + src[3] + 2) >> 2); } } } /* --------------------------------------------------------------------------- */ static void intra_pred_ang_x_11_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i; if (bsy > 8) { ALIGN16(pel_t first_line[(64 + 16) << 3]); int line_size = bsx + (bsy >> 3) - 1; int aligned_line_size = ((line_size + 15) >> 4) << 4; int i_dst8 = i_dst << 3; pel_t *pfirst[8]; pfirst[0] = first_line; pfirst[1] = pfirst[0] + aligned_line_size; pfirst[2] = pfirst[1] + aligned_line_size; pfirst[3] = pfirst[2] + aligned_line_size; pfirst[4] = pfirst[3] + aligned_line_size; pfirst[5] = pfirst[4] + aligned_line_size; pfirst[6] = pfirst[5] + aligned_line_size; pfirst[7] = pfirst[6] + aligned_line_size; for (i = 0; i < line_size; i++, src++) { pfirst[0][i] = (pel_t)((7 * src[0] + 15 * src[1] + 9 * src[2] + src[3] + 16) >> 5); pfirst[1][i] = (pel_t)((3 * src[0] + 7 * src[1] + 5 * src[2] + src[3] + 8) >> 4); pfirst[2][i] = (pel_t)((5 * src[0] + 13 * src[1] + 11 * src[2] + 3 * src[3] + 16) >> 5); pfirst[3][i] = (pel_t)(( src[0] + 3 * src[1] + 3 * src[2] + src[3] + 4) >> 3); pfirst[4][i] = (pel_t)((3 * src[0] + 11 * src[1] + 13 * src[2] + 5 * src[3] + 16) >> 5); pfirst[5][i] = (pel_t)(( src[0] + 5 * src[1] + 7 * src[2] + 3 * src[3] + 8) >> 4); pfirst[6][i] = (pel_t)(( src[0] + 9 * src[1] + 15 * src[2] + 7 * src[3] + 16) >> 5); pfirst[7][i] = (pel_t)(( src[1] + 2 * src[2] + src[3] + 0 * src[4] + 2) >> 2); } bsy >>= 3; for (i = 0; i < bsy; i++) { memcpy(dst , pfirst[0] + i, bsx * sizeof(pel_t)); memcpy(dst + i_dst, pfirst[1] + i, bsx * sizeof(pel_t)); memcpy(dst + 2 * i_dst, pfirst[2] + i, bsx * sizeof(pel_t)); memcpy(dst + 3 * i_dst, pfirst[3] + i, bsx * sizeof(pel_t)); memcpy(dst + 4 * i_dst, pfirst[4] + i, bsx * sizeof(pel_t)); memcpy(dst + 5 * i_dst, pfirst[5] + i, bsx * sizeof(pel_t)); memcpy(dst + 6 * i_dst, pfirst[6] + i, bsx * sizeof(pel_t)); memcpy(dst + 7 * i_dst, pfirst[7] + i, bsx * sizeof(pel_t)); dst += i_dst8; } } else if (bsy == 8) { pel_t *dst1 = dst; pel_t *dst2 = dst1 + i_dst; pel_t *dst3 = dst2 + i_dst; pel_t *dst4 = dst3 + i_dst; pel_t *dst5 = dst4 + i_dst; pel_t *dst6 = dst5 + i_dst; pel_t *dst7 = dst6 + i_dst; pel_t *dst8 = dst7 + i_dst; for (i = 0; i < bsx; i++, src++) { dst1[i] = (pel_t)((7 * src[0] + 15 * src[1] + 9 * src[2] + src[3] + 16) >> 5); dst2[i] = (pel_t)((3 * src[0] + 7 * src[1] + 5 * src[2] + src[3] + 8) >> 4); dst3[i] = (pel_t)((5 * src[0] + 13 * src[1] + 11 * src[2] + 3 * src[3] + 16) >> 5); dst4[i] = (pel_t)(( src[0] + 3 * src[1] + 3 * src[2] + src[3] + 4) >> 3); dst5[i] = (pel_t)((3 * src[0] + 11 * src[1] + 13 * src[2] + 5 * src[3] + 16) >> 5); dst6[i] = (pel_t)(( src[0] + 5 * src[1] + 7 * src[2] + 3 * src[3] + 8) >> 4); dst7[i] = (pel_t)(( src[0] + 9 * src[1] + 15 * src[2] + 7 * src[3] + 16) >> 5); dst8[i] = (pel_t)(( src[1] + 2 * src[2] + src[3] + + 2) >> 2); } } else { for (i = 0; i < bsx; i++, src++) { pel_t *dst1 = dst; pel_t *dst2 = dst1 + i_dst; pel_t *dst3 = dst2 + i_dst; pel_t *dst4 = dst3 + i_dst; dst1[i] = (pel_t)(( 7 * src[0] + 15 * src[1] + 9 * src[2] + src[3] + 16) >> 5); dst2[i] = 
(pel_t)(( 3 * src[0] + 7 * src[1] + 5 * src[2] + src[3] + 8) >> 4); dst3[i] = (pel_t)(( 5 * src[0] + 13 * src[1] + 11 * src[2] + 3 * src[3] + 16) >> 5); dst4[i] = (pel_t)(( src[0] + 3 * src[1] + 3 * src[2] + src[3] + 4) >> 3); } } } /* --------------------------------------------------------------------------- */ static void intra_pred_ang_y_25_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i; if (bsx > 8) { ALIGN16(pel_t first_line[64 + (64 << 3)]); int line_size = bsx + ((bsy - 1) << 3); int iHeight8 = bsy << 3; for (i = 0; i < line_size; i += 8, src--) { first_line[0 + i] = (pel_t)((src[0] * 7 + src[-1] * 15 + src[-2] * 9 + src[-3] * 1 + 16) >> 5); first_line[1 + i] = (pel_t)((src[0] * 3 + src[-1] * 7 + src[-2] * 5 + src[-3] * 1 + 8) >> 4); first_line[2 + i] = (pel_t)((src[0] * 5 + src[-1] * 13 + src[-2] * 11 + src[-3] * 3 + 16) >> 5); first_line[3 + i] = (pel_t)((src[0] * 1 + src[-1] * 3 + src[-2] * 3 + src[-3] * 1 + 4) >> 3); first_line[4 + i] = (pel_t)((src[0] * 3 + src[-1] * 11 + src[-2] * 13 + src[-3] * 5 + 16) >> 5); first_line[5 + i] = (pel_t)((src[0] * 1 + src[-1] * 5 + src[-2] * 7 + src[-3] * 3 + 8) >> 4); first_line[6 + i] = (pel_t)((src[0] * 1 + src[-1] * 9 + src[-2] * 15 + src[-3] * 7 + 16) >> 5); first_line[7 + i] = (pel_t)(( src[-1] * 1 + src[-2] * 2 + src[-3] * 1 + 2) >> 2); } for (i = 0; i < iHeight8; i += 8) { memcpy(dst, first_line + i, bsx * sizeof(pel_t)); dst += i_dst; } } else if (bsx == 8) { for (i = 0; i < bsy; i++, src--) { dst[0] = (pel_t)((src[0] * 7 + src[-1] * 15 + src[-2] * 9 + src[-3] * 1 + 16) >> 5); dst[1] = (pel_t)((src[0] * 3 + src[-1] * 7 + src[-2] * 5 + src[-3] * 1 + 8) >> 4); dst[2] = (pel_t)((src[0] * 5 + src[-1] * 13 + src[-2] * 11 + src[-3] * 3 + 16) >> 5); dst[3] = (pel_t)((src[0] * 1 + src[-1] * 3 + src[-2] * 3 + src[-3] * 1 + 4) >> 3); dst[4] = (pel_t)((src[0] * 3 + src[-1] * 11 + src[-2] * 13 + src[-3] * 5 + 16) >> 5); dst[5] = (pel_t)((src[0] * 1 + src[-1] * 5 + src[-2] * 7 + src[-3] * 3 + 8) >> 4); dst[6] = (pel_t)((src[0] * 1 + src[-1] * 9 + src[-2] * 15 + src[-3] * 7 + 16) >> 5); dst[7] = (pel_t)(( src[-1] * 1 + src[-2] * 2 + src[-3] * 1 + 2) >> 2); dst += i_dst; } } else { for (i = 0; i < bsy; i++, src--) { dst[0] = (pel_t)((src[0] * 7 + src[-1] * 15 + src[-2] * 9 + src[-3] * 1 + 16) >> 5); dst[1] = (pel_t)((src[0] * 3 + src[-1] * 7 + src[-2] * 5 + src[-3] * 1 + 8) >> 4); dst[2] = (pel_t)((src[0] * 5 + src[-1] * 13 + src[-2] * 11 + src[-3] * 3 + 16) >> 5); dst[3] = (pel_t)((src[0] * 1 + src[-1] * 3 + src[-2] * 3 + src[-3] * 1 + 4) >> 3); dst += i_dst; } } } /* --------------------------------------------------------------------------- */ static void intra_pred_ang_y_26_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i; if (bsx != 4) { ALIGN16(pel_t first_line[64 + 256]); int line_size = bsx + ((bsy - 1) << 2); int iHeight4 = bsy << 2; for (i = 0; i < line_size; i += 4, src--) { first_line[i ] = (pel_t)((src[ 0] * 3 + src[-1] * 7 + src[-2] * 5 + src[-3] + 8) >> 4); first_line[i + 1] = (pel_t)((src[ 0] + (src[-1] + src[-2]) * 3 + src[-3] + 4) >> 3); first_line[i + 2] = (pel_t)((src[ 0] + src[-1] * 5 + src[-2] * 7 + src[-3] * 3 + 8) >> 4); first_line[i + 3] = (pel_t)((src[-1] + src[-2] * 2 + src[-3] + 2) >> 2); } for (i = 0; i < iHeight4; i += 4) { memcpy(dst, first_line + i, bsx * sizeof(pel_t)); dst += i_dst; } } else { for (i = 0; i < bsy; i++, src--) { dst[0] = (pel_t)((src[ 0] * 3 + src[-1] * 7 + src[-2] * 5 + src[-3] + 8) >> 4); dst[1] = (pel_t)((src[ 0] + (src[-1] + src[-2]) * 3 
+ src[-3] + 4) >> 3); dst[2] = (pel_t)((src[ 0] + src[-1] * 5 + src[-2] * 7 + src[-3] * 3 + 8) >> 4); dst[3] = (pel_t)((src[-1] + src[-2] * 2 + src[-3] + 2) >> 2); dst += i_dst; } } } /* --------------------------------------------------------------------------- */ static void intra_pred_ang_y_27_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i; if (bsx > 8){ intra_pred_ang_y_c(src, dst, i_dst, dir_mode, bsx, bsy); } else if (bsx == 8){ for (i = 0; i < bsy; i++, src--) { dst[0] = (pel_t)((21 * src[0] + 53 * src[-1] + 43 * src[-2] + 11 * src[-3] + 64) >> 7); dst[1] = (pel_t)(( 9 * src[0] + 41 * src[-1] + 55 * src[-2] + 23 * src[-3] + 64) >> 7); dst[2] = (pel_t)((15 * src[-1] + 31 * src[-2] + 17 * src[-3] + 1 * src[-4] + 32) >> 6); dst[3] = (pel_t)(( 9 * src[-1] + 25 * src[-2] + 23 * src[-3] + 7 * src[-4] + 32) >> 6); dst[4] = (pel_t)(( 3 * src[-1] + 19 * src[-2] + 29 * src[-3] + 13 * src[-4] + 32) >> 6); dst[5] = (pel_t)((27 * src[-2] + 59 * src[-3] + 37 * src[-4] + 5 * src[-5] + 64) >> 7); dst[6] = (pel_t)((15 * src[-2] + 47 * src[-3] + 49 * src[-4] + 17 * src[-5] + 64) >> 7); dst[7] = (pel_t)(( 3 * src[-2] + 35 * src[-3] + 61 * src[-4] + 29 * src[-5] + 64) >> 7); dst += i_dst; } } else{ for (i = 0; i < bsy; i++, src--) { dst[0] = (pel_t)((21 * src[0] + 53 * src[-1] + 43 * src[-2] + 11 * src[-3] + 64) >> 7); dst[1] = (pel_t)(( 9 * src[0] + 41 * src[-1] + 55 * src[-2] + 23 * src[-3] + 64) >> 7); dst[2] = (pel_t)((15 * src[-1] + 31 * src[-2] + 17 * src[-3] + 1 * src[-4] + 32) >> 6); dst[3] = (pel_t)(( 9 * src[-1] + 25 * src[-2] + 23 * src[-3] + 7 * src[-4] + 32) >> 6); dst += i_dst; } } } /* --------------------------------------------------------------------------- */ static void intra_pred_ang_y_28_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { ALIGN16(pel_t first_line[64 + 128]); int line_size = bsx + ((bsy - 1) << 1); #if !BUGFIX_PREDICTION_INTRA int real_size = STARAVS_MIN(line_size, (bsy << 2)); #endif int iHeight2 = bsy << 1; int i; #if !BUGFIX_PREDICTION_INTRA pel_t pad1, pad2; #endif #if BUGFIX_PREDICTION_INTRA for (i = 0; i < line_size; i += 2, src--) { #else for (i = 0; i < real_size; i += 2, src--) { #endif first_line[i ] = (pel_t)((src[ 0] + (src[-1] + src[-2]) * 3 + src[-3] + 4) >> 3); first_line[i + 1] = (pel_t)((src[-1] + (src[-2] << 1) + src[-3] + 2) >> 2); } #if !BUGFIX_PREDICTION_INTRA // padding if (real_size < line_size) { first_line[i - 1] = first_line[i - 3]; pad1 = first_line[i - 2]; pad2 = first_line[i - 1]; for (; i < line_size; i += 2) { first_line[i ] = pad1; first_line[i + 1] = pad2; } } #endif for (i = 0; i < iHeight2; i += 2) { memcpy(dst, first_line + i, bsx * sizeof(pel_t)); dst += i_dst; } } /* --------------------------------------------------------------------------- */ static void intra_pred_ang_y_29_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i; if (bsx > 8) { intra_pred_ang_y_c(src, dst, i_dst, dir_mode, bsx, bsy); } else if (bsx == 8) { for (i = 0; i < bsy; i++, src--) { dst[0] = (pel_t)((src[0] * 9 + src[-1] * 41 + src[-2] * 55 + src[-3] * 23 + 64) >> 7); dst[1] = (pel_t)((src[-1] * 9 + src[-2] * 25 + src[-3] * 23 + src[-4] * 7 + 32) >> 6); dst[2] = (pel_t)((src[-2] * 27 + src[-3] * 59 + src[-4] * 37 + src[-5] * 5 + 64) >> 7); dst[3] = (pel_t)((src[-2] * 3 + src[-3] * 35 + src[-4] * 61 + src[-5] * 29 + 64) >> 7); dst[4] = (pel_t)((src[-3] * 3 + src[-4] * 11 + src[-5] * 13 + src[-6] * 5 + 16) >> 5); dst[5] = (pel_t)((src[-4] * 21 + src[-5] * 53 + src[-6] * 43 + src[-7] * 
11 + 64) >> 7); dst[6] = (pel_t)((src[-5] * 15 + src[-6] * 31 + src[-7] * 17 + src[-8] + 32) >> 6); dst[7] = (pel_t)((src[-5] * 3 + src[-6] * 19 + src[-7] * 29 + src[-8] * 13 + 32) >> 6); dst += i_dst; } } else { for (i = 0; i < bsy; i++, src--) { dst[0] = (pel_t)((src[0] * 9 + src[-1] * 41 + src[-2] * 55 + src[-3] * 23 + 64) >> 7); dst[1] = (pel_t)((src[-1] * 9 + src[-2] * 25 + src[-3] * 23 + src[-4] * 7 + 32) >> 6); dst[2] = (pel_t)((src[-2] * 27 + src[-3] * 59 + src[-4] * 37 + src[-5] * 5 + 64) >> 7); dst[3] = (pel_t)((src[-2] * 3 + src[-3] * 35 + src[-4] * 61 + src[-5] * 29 + 64) >> 7); dst += i_dst; } } } /* --------------------------------------------------------------------------- */ static void intra_pred_ang_y_30_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { ALIGN16(pel_t first_line[64 + 64]); int line_size = bsx + bsy - 1; #if !BUGFIX_PREDICTION_INTRA int real_size = STARAVS_MIN(line_size, (bsy << 1) - 1); #endif int i; #if !BUGFIX_PREDICTION_INTRA pel_t pad; #endif src -= 2; #if BUGFIX_PREDICTION_INTRA for (i = 0; i < line_size; i++, src--) { #else for (i = 0; i < real_size; i++, src--) { #endif first_line[i] = (pel_t)((src[-1] + (src[0] << 1) + src[1] + 2) >> 2); } #if !BUGFIX_PREDICTION_INTRA // padding pad = first_line[real_size - 1]; for (; i < line_size; i++) { first_line[i] = pad; } #endif for (i = 0; i < bsy; i++) { memcpy(dst, first_line + i, bsx * sizeof(pel_t)); dst += i_dst; } } /* --------------------------------------------------------------------------- */ static void intra_pred_ang_y_31_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { ALIGN16(pel_t dst_tran[MAX_CU_SIZE * MAX_CU_SIZE]); ALIGN16(pel_t src_tran[MAX_CU_SIZE << 3]); int i; if (bsx >= bsy){ // transposition #if BUGFIX_PREDICTION_INTRA //i < (bsx * 19 / 8 + 3) for (i = 0; i < (bsy + bsx * 11 / 8 + 3); i++){ #else for (i = 0; i < (2 * bsy + 3); i++){ #endif src_tran[i] = src[-i]; } intra_pred_ang_x_5_c(src_tran, dst_tran, bsy, 5, bsy, bsx); for (i = 0; i < bsy; i++){ for (int j = 0; j < bsx; j++){ dst[j + i_dst * i] = dst_tran[i + bsy * j]; } } } else if (bsx == 8){ for (i = 0; i < bsy; i++, src--) { dst[0] = (pel_t)((5 * src[-1] + 13 * src[-2] + 11 * src[-3] + 3 * src[-4] + 16) >> 5); dst[1] = (pel_t)((1 * src[-2] + 5 * src[-3] + 7 * src[-4] + 3 * src[-5] + 8) >> 4); dst[2] = (pel_t)((7 * src[-4] + 15 * src[-5] + 9 * src[-6] + 1 * src[-7] + 16) >> 5); dst[3] = (pel_t)((1 * src[-5] + 3 * src[-6] + 3 * src[-7] + 1 * src[-8] + 4) >> 3); dst[4] = (pel_t)((1 * src[-6] + 9 * src[-7] + 15 * src[-8] + 7 * src[-9] + 16) >> 5); dst[5] = (pel_t)((3 * src[-8] + 7 * src[-9] + 5 * src[-10] + 1 * src[-11] + 8) >> 4); dst[6] = (pel_t)((3 * src[-9] + 11 * src[-10] + 13 * src[-11] + 5 * src[-12] + 16) >> 5); dst[7] = (pel_t)((1 * src[-11] + 2 * src[-12] + 1 * src[-13] + 0 * src[-14] + 2) >> 2); dst += i_dst; } } else { for (i = 0; i < bsy; i++, src--) { dst[0] = (pel_t)((5 * src[-1] + 13 * src[-2] + 11 * src[-3] + 3 * src[-4] + 16) >> 5); dst[1] = (pel_t)((1 * src[-2] + 5 * src[-3] + 7 * src[-4] + 3 * src[-5] + 8) >> 4); dst[2] = (pel_t)((7 * src[-4] + 15 * src[-5] + 9 * src[-6] + 1 * src[-7] + 16) >> 5); dst[3] = (pel_t)((1 * src[-5] + 3 * src[-6] + 3 * src[-7] + 1 * src[-8] + 4) >> 3); dst += i_dst; } } } /* --------------------------------------------------------------------------- */ static void intra_pred_ang_y_32_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { ALIGN16(pel_t first_line[2 * (32 + 64)]); int line_size = (bsy >> 1) + bsx - 1; #if 
!BUGFIX_PREDICTION_INTRA int real_size = STARAVS_MIN(line_size, bsy - 1); #endif int aligned_line_size = ((line_size + 15) >> 4) << 4; int i_dst2 = i_dst << 1; int i; #if !BUGFIX_PREDICTION_INTRA pel_t pad; #endif pel_t *pfirst[2]; pfirst[0] = first_line; pfirst[1] = first_line + aligned_line_size; src -= 3; #if BUGFIX_PREDICTION_INTRA for (i = 0; i < line_size; i++, src -= 2) { #else for (i = 0; i < real_size; i++, src -= 2) { #endif pfirst[0][i] = (pel_t)((src[1] + (src[ 0] << 1) + src[-1] + 2) >> 2); pfirst[1][i] = (pel_t)((src[0] + (src[-1] << 1) + src[-2] + 2) >> 2); } #if !BUGFIX_PREDICTION_INTRA // padding pad = pfirst[1][i - 1]; for (; i < line_size; i++) { pfirst[0][i] = pad; pfirst[1][i] = pad; } #endif bsy >>= 1; for (i = 0; i < bsy; i++) { memcpy(dst , pfirst[0] + i, bsx * sizeof(pel_t)); memcpy(dst + i_dst, pfirst[1] + i, bsx * sizeof(pel_t)); dst += i_dst2; } } /* --------------------------------------------------------------------------- */ static void intra_pred_ang_xy_13_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i; if (bsy > 8) { ALIGN16(pel_t first_line[(64 + 16) << 3]); int line_size = bsx + (bsy >> 3) - 1; int left_size = line_size - bsx; int aligned_line_size = ((line_size + 15) >> 4) << 4; pel_t *pfirst[8]; pfirst[0] = first_line; pfirst[1] = pfirst[0] + aligned_line_size; pfirst[2] = pfirst[1] + aligned_line_size; pfirst[3] = pfirst[2] + aligned_line_size; pfirst[4] = pfirst[3] + aligned_line_size; pfirst[5] = pfirst[4] + aligned_line_size; pfirst[6] = pfirst[5] + aligned_line_size; pfirst[7] = pfirst[6] + aligned_line_size; src -= bsy - 8; for (i = 0; i < left_size; i++, src += 8) { pfirst[0][i] = (pel_t)((src[6] + (src[7] << 1) + src[8] + 2) >> 2); pfirst[1][i] = (pel_t)((src[5] + (src[6] << 1) + src[7] + 2) >> 2); pfirst[2][i] = (pel_t)((src[4] + (src[5] << 1) + src[6] + 2) >> 2); pfirst[3][i] = (pel_t)((src[3] + (src[4] << 1) + src[5] + 2) >> 2); pfirst[4][i] = (pel_t)((src[2] + (src[3] << 1) + src[4] + 2) >> 2); pfirst[5][i] = (pel_t)((src[1] + (src[2] << 1) + src[3] + 2) >> 2); pfirst[6][i] = (pel_t)((src[0] + (src[1] << 1) + src[2] + 2) >> 2); pfirst[7][i] = (pel_t)((src[-1] + (src[0] << 1) + src[1] + 2) >> 2); } for (; i < line_size; i++, src++) { pfirst[0][i] = (pel_t)((7 * src[2] + 15 * src[1] + 9 * src[0] + src[-1] + 16) >> 5); pfirst[1][i] = (pel_t)((3 * src[2] + 7 * src[1] + 5 * src[0] + src[-1] + 8) >> 4); pfirst[2][i] = (pel_t)((5 * src[2] + 13 * src[1] + 11 * src[0] + 3 * src[-1] + 16) >> 5); pfirst[3][i] = (pel_t)((src[2] + 3 * src[1] + 3 * src[0] + src[-1] + 4) >> 3); pfirst[4][i] = (pel_t)((3 * src[2] + 11 * src[1] + 13 * src[0] + 5 * src[-1] + 16) >> 5); pfirst[5][i] = (pel_t)((src[2] + 5 * src[1] + 7 * src[0] + 3 * src[-1] + 8) >> 4); pfirst[6][i] = (pel_t)((src[2] + 9 * src[1] + 15 * src[0] + 7 * src[-1] + 16) >> 5); pfirst[7][i] = (pel_t)((src[1] + 2 * src[0] + src[-1] + 2) >> 2); } pfirst[0] += left_size; pfirst[1] += left_size; pfirst[2] += left_size; pfirst[3] += left_size; pfirst[4] += left_size; pfirst[5] += left_size; pfirst[6] += left_size; pfirst[7] += left_size; bsy >>= 3; for (i = 0; i < bsy; i++) { memcpy(dst, pfirst[0] - i, bsx * sizeof(pel_t)); dst += i_dst; memcpy(dst, pfirst[1] - i, bsx * sizeof(pel_t)); dst += i_dst; memcpy(dst, pfirst[2] - i, bsx * sizeof(pel_t)); dst += i_dst; memcpy(dst, pfirst[3] - i, bsx * sizeof(pel_t)); dst += i_dst; memcpy(dst, pfirst[4] - i, bsx * sizeof(pel_t)); dst += i_dst; memcpy(dst, pfirst[5] - i, bsx * sizeof(pel_t)); dst += i_dst; memcpy(dst, pfirst[6] - i, bsx * 
sizeof(pel_t)); dst += i_dst; memcpy(dst, pfirst[7] - i, bsx * sizeof(pel_t)); dst += i_dst; } } else if (bsy == 8) { pel_t *dst1 = dst; pel_t *dst2 = dst1 + i_dst; pel_t *dst3 = dst2 + i_dst; pel_t *dst4 = dst3 + i_dst; pel_t *dst5 = dst4 + i_dst; pel_t *dst6 = dst5 + i_dst; pel_t *dst7 = dst6 + i_dst; pel_t *dst8 = dst7 + i_dst; for (i = 0; i < bsx; i++, src++) { dst1[i] = (pel_t)((7 * src[2] + 15 * src[1] + 9 * src[0] + src[-1] + 16) >> 5); dst2[i] = (pel_t)((3 * src[2] + 7 * src[1] + 5 * src[0] + src[-1] + 8) >> 4); dst3[i] = (pel_t)((5 * src[2] + 13 * src[1] + 11 * src[0] + 3 * src[-1] + 16) >> 5); dst4[i] = (pel_t)((src[2] + 3 * src[1] + 3 * src[0] + src[-1] + 4) >> 3); dst5[i] = (pel_t)((3 * src[2] + 11 * src[1] + 13 * src[0] + 5 * src[-1] + 16) >> 5); dst6[i] = (pel_t)((src[2] + 5 * src[1] + 7 * src[0] + 3 * src[-1] + 8) >> 4); dst7[i] = (pel_t)((src[2] + 9 * src[1] + 15 * src[0] + 7 * src[-1] + 16) >> 5); dst8[i] = (pel_t)((src[1] + 2 * src[0] + src[-1] + 2) >> 2); } } else { for (i = 0; i < bsx; i++, src++) { pel_t *dst1 = dst; pel_t *dst2 = dst1 + i_dst; pel_t *dst3 = dst2 + i_dst; pel_t *dst4 = dst3 + i_dst; dst1[i] = (pel_t)((7 * src[2] + 15 * src[1] + 9 * src[0] + src[-1] + 16) >> 5); dst2[i] = (pel_t)((3 * src[2] + 7 * src[1] + 5 * src[0] + src[-1] + 8) >> 4); dst3[i] = (pel_t)((5 * src[2] + 13 * src[1] + 11 * src[0] + 3 * src[-1] + 16) >> 5); dst4[i] = (pel_t)(( src[2] + 3 * src[1] + 3 * src[0] + src[-1] + 4) >> 3); } } } static void intra_pred_ang_xy_14_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i; if (bsy != 4) { ALIGN16(pel_t first_line[4 * (64 + 16)]); int line_size = bsx + (bsy >> 2) - 1; int left_size = line_size - bsx; int aligned_line_size = ((line_size + 15) >> 4) << 4; pel_t *pfirst[4]; pfirst[0] = first_line; pfirst[1] = first_line + aligned_line_size; pfirst[2] = first_line + aligned_line_size * 2; pfirst[3] = first_line + aligned_line_size * 3; src -= bsy - 4; for (i = 0; i < left_size; i++, src += 4) { pfirst[0][i] = (pel_t)((src[ 2] + (src[3] << 1) + src[4] + 2) >> 2); pfirst[1][i] = (pel_t)((src[ 1] + (src[2] << 1) + src[3] + 2) >> 2); pfirst[2][i] = (pel_t)((src[ 0] + (src[1] << 1) + src[2] + 2) >> 2); pfirst[3][i] = (pel_t)((src[-1] + (src[0] << 1) + src[1] + 2) >> 2); } for (; i < line_size; i++, src++) { pfirst[0][i] = (pel_t)((src[-1] + src[0] * 5 + src[1] * 7 + src[2] * 3 + 8) >> 4); pfirst[1][i] = (pel_t)((src[-1] + (src[0] + src[1]) * 3 + src[2] + 4) >> 3); pfirst[2][i] = (pel_t)((src[-1] * 3 + src[0] * 7 + src[1] * 5 + src[2] + 8) >> 4); pfirst[3][i] = (pel_t)((src[-1] + src[0] * 2 + src[1] + 2) >> 2); } pfirst[0] += left_size; pfirst[1] += left_size; pfirst[2] += left_size; pfirst[3] += left_size; bsy >>= 2; for (i = 0; i < bsy; i++) { memcpy(dst, pfirst[0] - i, bsx * sizeof(pel_t)); dst += i_dst; memcpy(dst, pfirst[1] - i, bsx * sizeof(pel_t)); dst += i_dst; memcpy(dst, pfirst[2] - i, bsx * sizeof(pel_t)); dst += i_dst; memcpy(dst, pfirst[3] - i, bsx * sizeof(pel_t)); dst += i_dst; } } else { pel_t *dst1 = dst; pel_t *dst2 = dst1 + i_dst; pel_t *dst3 = dst2 + i_dst; pel_t *dst4 = dst3 + i_dst; for (i = 0; i < bsx; i++, src++) { dst1[i] = (pel_t)((src[-1] + src[0] * 5 + src[1] * 7 + src[2] * 3 + 8) >> 4); dst2[i] = (pel_t)((src[-1] + (src[0] + src[1]) * 3 + src[2] + 4) >> 3); dst3[i] = (pel_t)((src[-1] * 3 + src[0] * 7 + src[1] * 5 + src[2] + 8) >> 4); dst4[i] = (pel_t)((src[-1] + src[0] * 2 + src[1] + 2) >> 2); } } } /* --------------------------------------------------------------------------- */ static void 
intra_pred_ang_xy_16_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { ALIGN16(pel_t first_line[2 * (64 + 32)]); int line_size = bsx + (bsy >> 1) - 1; int left_size = line_size - bsx; int aligned_line_size = ((line_size + 15) >> 4) << 4; int i_dst2 = i_dst << 1; pel_t *pfirst[2]; int i; pfirst[0] = first_line; pfirst[1] = first_line + aligned_line_size; src -= bsy - 2; for (i = 0; i < left_size; i++, src += 2) { pfirst[0][i] = (pel_t)((src[ 0] + (src[1] << 1) + src[2] + 2) >> 2); pfirst[1][i] = (pel_t)((src[-1] + (src[0] << 1) + src[1] + 2) >> 2); } for (; i < line_size; i++, src++) { pfirst[0][i] = (pel_t)((src[-1] + (src[0] + src[1]) * 3 + src[2] + 4) >> 3); pfirst[1][i] = (pel_t)((src[-1] + (src[0] << 1) + src[1] + 2) >> 2); } pfirst[0] += left_size; pfirst[1] += left_size; bsy >>= 1; for (i = 0; i < bsy; i++) { memcpy(dst , pfirst[0] - i, bsx * sizeof(pel_t)); memcpy(dst + i_dst, pfirst[1] - i, bsx * sizeof(pel_t)); dst += i_dst2; } } /* --------------------------------------------------------------------------- */ static void intra_pred_ang_xy_18_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { ALIGN16(pel_t first_line[64 + 64]); int line_size = bsx + bsy - 1; int i; pel_t *pfirst = first_line + bsy - 1; src -= bsy - 1; for (i = 0; i < line_size; i++, src++) { first_line[i] = (pel_t)((src[-1] + (src[0] << 1) + src[1] + 2) >> 2); } for (i = 0; i < bsy; i++) { memcpy(dst, pfirst, bsx * sizeof(pel_t)); pfirst--; dst += i_dst; } } /* --------------------------------------------------------------------------- */ static void intra_pred_ang_xy_20_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { ALIGN16(pel_t first_line[64 + 128]); int left_size = ((bsy - 1) << 1) + 1; int top_size = bsx - 1; int line_size = left_size + top_size; int i; pel_t *pfirst = first_line + left_size - 1; src -= bsy; for (i = 0; i < left_size; i += 2, src++) { first_line[i ] = (pel_t)((src[-1] + (src[0] + src[1]) * 3 + src[2] + 4) >> 3); first_line[i + 1] = (pel_t)(( src[0] + (src[1] << 1) + src[2] + 2) >> 2); } i--; for (; i < line_size; i++, src++) { first_line[i] = (pel_t)((src[-1] + (src[0] << 1) + src[1] + 2) >> 2); } for (i = 0; i < bsy; i++) { memcpy(dst, pfirst, bsx * sizeof(pel_t)); pfirst -= 2; dst += i_dst; } } /* --------------------------------------------------------------------------- */ static void intra_pred_ang_xy_22_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i; if (bsx != 4) { src -= bsy; ALIGN16(pel_t first_line[64 + 256]); int left_size = ((bsy - 1) << 2) + 3; int top_size = bsx - 3; int line_size = left_size + top_size; pel_t *pfirst = first_line + left_size - 3; for (i = 0; i < left_size; i += 4, src++) { first_line[i ] = (pel_t)((src[-1] * 3 + src[0] * 7 + src[1] * 5 + src[2] + 8) >> 4); first_line[i + 1] = (pel_t)((src[-1] + (src[0] + src[1]) * 3 + src[2] + 4) >> 3); first_line[i + 2] = (pel_t)((src[-1] + src[0] * 5 + src[1] * 7 + src[2] * 3 + 8) >> 4); first_line[i + 3] = (pel_t)(( src[0] + src[1] * 2 + src[2] + 2) >> 2); } i--; for (; i < line_size; i++, src++) { first_line[i] = (pel_t)((src[-1] + (src[0] << 1) + src[1] + 2) >> 2); } for (i = 0; i < bsy; i++) { memcpy(dst, pfirst, bsx * sizeof(pel_t)); dst += i_dst; pfirst -= 4; } } else { for (i = 0; i < bsy; i++, src--) { dst[0] = (pel_t)((src[-2] * 3 + src[-1] * 7 + src[0] * 5 + src[1] + 8) >> 4); dst[1] = (pel_t)((src[-2] + (src[-1] + src[0]) * 3 + src[1] + 4) >> 3); dst[2] = (pel_t)((src[-2] + src[-1] * 5 + src[0] * 7 + src[1] * 3 + 8) >> 
4);
            dst[3] = (pel_t)((src[-1] + src[0] * 2 + src[1] + 2) >> 2);
            dst += i_dst;
        }
        // needn't pad, (3,0) is equal for ang_x and ang_y
    }
}

/* --------------------------------------------------------------------------- */
static void intra_pred_ang_xy_23_c(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
{
    int i;

    if (bsx > 8) {
        ALIGN16(pel_t first_line[64 + 512]);
        int left_size = (bsy << 3) - 1;
        int top_size  = bsx - 7;
        int line_size = left_size + top_size;
        pel_t *pfirst = first_line + left_size - 7;

        src -= bsy;
        for (i = 0; i < left_size; i += 8, src++) {
            first_line[i    ] = (pel_t)((7 * src[-1] + 15 * src[0] +  9 * src[1] +     src[2] + 16) >> 5);
            first_line[i + 1] = (pel_t)((3 * src[-1] +  7 * src[0] +  5 * src[1] +     src[2] +  8) >> 4);
            first_line[i + 2] = (pel_t)((5 * src[-1] + 13 * src[0] + 11 * src[1] + 3 * src[2] + 16) >> 5);
            first_line[i + 3] = (pel_t)((    src[-1] +  3 * src[0] +  3 * src[1] +     src[2] +  4) >> 3);
            first_line[i + 4] = (pel_t)((3 * src[-1] + 11 * src[0] + 13 * src[1] + 5 * src[2] + 16) >> 5);
            first_line[i + 5] = (pel_t)((    src[-1] +  5 * src[0] +  7 * src[1] + 3 * src[2] +  8) >> 4);
            first_line[i + 6] = (pel_t)((    src[-1] +  9 * src[0] + 15 * src[1] + 7 * src[2] + 16) >> 5);
            first_line[i + 7] = (pel_t)((    src[ 0] +  2 * src[1] +     src[2] + 0 * src[3] +  2) >> 2);
        }
        i--;
        for (; i < line_size; i++, src++) {
            first_line[i] = (pel_t)((src[1] + (src[0] << 1) + src[-1] + 2) >> 2);
        }

        for (i = 0; i < bsy; i++) {
            memcpy(dst, pfirst, bsx * sizeof(pel_t));
            dst    += i_dst;
            pfirst -= 8;
        }
    } else if (bsx == 8) {
        for (i = 0; i < bsy; i++, src--) {
            dst[0] = (pel_t)((7 * src[-2] + 15 * src[-1] +  9 * src[0] +     src[1] + 16) >> 5);
            dst[1] = (pel_t)((3 * src[-2] +  7 * src[-1] +  5 * src[0] +     src[1] +  8) >> 4);
            dst[2] = (pel_t)((5 * src[-2] + 13 * src[-1] + 11 * src[0] + 3 * src[1] + 16) >> 5);
            dst[3] = (pel_t)((    src[-2] +  3 * src[-1] +  3 * src[0] +     src[1] +  4) >> 3);
            dst[4] = (pel_t)((3 * src[-2] + 11 * src[-1] + 13 * src[0] + 5 * src[1] + 16) >> 5);
            dst[5] = (pel_t)((    src[-2] +  5 * src[-1] +  7 * src[0] + 3 * src[1] +  8) >> 4);
            dst[6] = (pel_t)((    src[-2] +  9 * src[-1] + 15 * src[0] + 7 * src[1] + 16) >> 5);
            dst[7] = (pel_t)((    src[-1] +  2 * src[ 0] +     src[1] + 0 * src[2] +  2) >> 2);
            dst += i_dst;
        }
        // needn't pad, (7,0) is equal for ang_x and ang_y
    } else {
        for (i = 0; i < bsy; i++, src--) {
            dst[0] = (pel_t)((7 * src[-2] + 15 * src[-1] +  9 * src[0] +     src[1] + 16) >> 5);
            dst[1] = (pel_t)((3 * src[-2] +  7 * src[-1] +  5 * src[0] +     src[1] +  8) >> 4);
            dst[2] = (pel_t)((5 * src[-2] + 13 * src[-1] + 11 * src[0] + 3 * src[1] + 16) >> 5);
            dst[3] = (pel_t)((    src[-2] +  3 * src[-1] +  3 * src[0] +     src[1] +  4) >> 3);
            dst += i_dst;
        }
    }
}
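
/* ---------------------------------------------------------------------------
 * Illustration (added sketch, not compiled): every 4-tap weight set used by
 * the angular modes above sums to a power of two, so the rounding term and
 * the final shift keep the result in pixel range.  `ref` is a hypothetical
 * reference line; the taps are the (7,15,9,1)/32 set from mode 23 above. */
#if 0
static pel_t example_filter_one_sample(const pel_t *ref)
{
    /* 7 + 15 + 9 + 1 == 32; '+ 16' rounds, '>> 5' renormalizes */
    return (pel_t)((7 * ref[-1] + 15 * ref[0] + 9 * ref[1] + ref[2] + 16) >> 5);
}
#endif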
/* ---------------------------------------------------------------------------
 * fill reference samples for intra prediction
 * for a PU at the top-left corner of the LCU: both the top and the left
 * reference samples come from the LCU edge-pixel buffer (pLcuEP) */
static void fill_reference_samples_0_c(const pel_t *pTL, int i_TL, const pel_t *pLcuEP, pel_t *EP, uint32_t i_avai, int bsx, int bsy)
{
    int num_padding = 0;

    /* fill default value */
    mem_repeat_p(&EP[-(bsy << 1)], g_dc_value, ((bsy + bsx) << 1) + 1);

    /* get prediction pixels ---------------------------------------
     * extra pixels          | left-down pixels   | left pixels   | top-left | top pixels  | top-right pixels  | extra pixels
     * -2*bsy-4 ... -2*bsy-1 | -bsy-bsy ... -bsy-1| -bsy -3 -2 -1 |     0    | 1 2 ... bsx | bsx+1 ... bsx+bsx | 2*bsx+1 ... 2*bsx+4 */

    /* fill top & top-right pixels */
    if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP)) {
        /* fill top pixels */
        gf_davs2.fast_memcpy(&EP[1], &pLcuEP[1], bsx * sizeof(pel_t));
    }

    /* fill top-right pixels */
    if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP_RIGHT)) {
        gf_davs2.fast_memcpy(&EP[bsx + 1], &pLcuEP[bsx + 1], bsx * sizeof(pel_t));
    } else {
        mem_repeat_p(&EP[bsx + 1], EP[bsx], bsx);   // repeat the last pixel
    }

    /* fill extra pixels */
    num_padding = bsy * 11 / 4 - bsx + 4;
    if (num_padding > 0) {
        mem_repeat_p(&EP[2 * bsx + 1], EP[2 * bsx], num_padding);  // from (2*bsx) to (iX + 3) = (bsy *11/4 + bsx - 1) + 3
    }

    /* fill left & left-down pixels */
    if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT)) {
        /* fill left pixels */
        memcpy(&EP[-bsy], &pLcuEP[-bsy], bsy * sizeof(pel_t));
    }

    /* fill left-down pixels */
    if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT_DOWN)) {
        memcpy(&EP[-2 * bsy], &pLcuEP[-2 * bsy], bsy * sizeof(pel_t));
    } else {
        mem_repeat_p(&EP[-(bsy << 1)], EP[-bsy], bsy);
    }

    /* fill top-left pixel */
    if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP_LEFT)) {
        EP[0] = pLcuEP[0];
    } else if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP)) {
        EP[0] = pLcuEP[1];
    } else if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT)) {
        EP[0] = pLcuEP[-1];
    }

    /* fill extra pixels */
    num_padding = bsx * 11 / 4 - bsy + 4;
    if (num_padding > 0) {
        mem_repeat_p(&EP[-2 * bsy - num_padding], EP[-2 * bsy], num_padding);  // from (-2*bsy) to (-iY - 3) = -(bsx *11/4 + bsy - 1) - 3
    }
}
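
/* ---------------------------------------------------------------------------
 * Illustration (added sketch, not compiled): how a prediction mode indexes
 * the EP buffer filled above.  Index 0 is the top-left sample, positive
 * indices run along the top row, negative indices down the left column.
 * Sketch for a hypothetical 8x8 block (bsx == bsy == 8): */
#if 0
pel_t top_left   = EP[0];       /* corner sample                   */
pel_t first_top  = EP[1];       /* EP[1..8]    : top neighbors     */
pel_t first_left = EP[-1];      /* EP[-1..-8]  : left neighbors    */
pel_t top_right  = EP[8 + 1];   /* EP[9..16]   : top-right samples */
pel_t left_down  = EP[-8 - 1];  /* EP[-9..-16] : left-down samples */
#endif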
/* ---------------------------------------------------------------------------
 * fill reference samples for intra prediction
 * for a PU on the top boundary of the LCU: top samples come from the LCU
 * edge-pixel buffer, left samples from the reconstructed picture */
static void fill_reference_samples_x_c(const pel_t *pTL, int i_TL, const pel_t *pLcuEP, pel_t *EP, uint32_t i_avai, int bsx, int bsy)
{
    const pel_t *pL = pTL + i_TL;
    int num_padding = 0;

    /* fill default value */
    mem_repeat_p(&EP[-(bsy << 1)], g_dc_value, ((bsy + bsx) << 1) + 1);

    /* get prediction pixels ---------------------------------------
     * extra pixels          | left-down pixels   | left pixels   | top-left | top pixels  | top-right pixels  | extra pixels
     * -2*bsy-4 ... -2*bsy-1 | -bsy-bsy ... -bsy-1| -bsy -3 -2 -1 |     0    | 1 2 ... bsx | bsx+1 ... bsx+bsx | 2*bsx+1 ... 2*bsx+4 */

    /* fill top & top-right pixels */
    if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP)) {
        /* fill top pixels */
        gf_davs2.fast_memcpy(&EP[1], &pLcuEP[1], bsx * sizeof(pel_t));
    }

    /* fill top-right pixels */
    if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP_RIGHT)) {
        gf_davs2.fast_memcpy(&EP[bsx + 1], &pLcuEP[bsx + 1], bsx * sizeof(pel_t));
    } else {
        mem_repeat_p(&EP[bsx + 1], EP[bsx], bsx);   // repeat the last pixel
    }

    /* fill extra pixels */
    num_padding = bsy * 11 / 4 - bsx + 4;
    if (num_padding > 0) {
        mem_repeat_p(&EP[2 * bsx + 1], EP[2 * bsx], num_padding);  // from (2*bsx) to (iX + 3) = (bsy *11/4 + bsx - 1) + 3
    }

    /* fill left & left-down pixels */
    if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT)) {
        const pel_t *p_l = pL;
        int y;
        /* fill left pixels */
        for (y = 0; y < bsy; y++) {
            EP[-1 - y] = *p_l;
            p_l += i_TL;
        }
    }

    /* fill left-down pixels */
    if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT_DOWN)) {
        int y;
        const pel_t *p_l = pL + bsy * i_TL;
        for (y = 0; y < bsy; y++) {
            EP[-bsy - 1 - y] = *p_l;
            p_l += i_TL;
        }
    } else {
        mem_repeat_p(&EP[-(bsy << 1)], EP[-bsy], bsy);
    }

    /* fill top-left pixel */
    if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP_LEFT)) {
        EP[0] = pLcuEP[0];
    } else if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP)) {
        EP[0] = pLcuEP[1];
    } else if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT)) {
        EP[0] = pL[0];
    }

    /* fill extra pixels */
    num_padding = bsx * 11 / 4 - bsy + 4;
    if (num_padding > 0) {
        mem_repeat_p(&EP[-2 * bsy - num_padding], EP[-2 * bsy], num_padding);  // from (-2*bsy) to (-iY - 3) = -(bsx *11/4 + bsy - 1) - 3
    }
}
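
/* ---------------------------------------------------------------------------
 * Worked example (added note): the extra padding above guarantees enough
 * samples for the farthest-projecting angular modes (up to reference index
 * bsy*11/4 + bsx - 1, plus a 3-sample margin, per the in-line notes).
 * For bsx == bsy == 8:
 *     num_padding = bsy * 11 / 4 - bsx + 4 = 22 - 8 + 4 = 18
 * so 18 repeated samples are appended after EP[2 * bsx]. */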
/* ---------------------------------------------------------------------------
 * fill reference samples for intra prediction
 * for a PU on the left boundary of the LCU: left samples come from the LCU
 * edge-pixel buffer, top samples from the reconstructed picture */
static void fill_reference_samples_y_c(const pel_t *pTL, int i_TL, const pel_t *pLcuEP, pel_t *EP, uint32_t i_avai, int bsx, int bsy)
{
    const pel_t *pT = pTL + 1;
    int num_padding = 0;

    /* fill default value */
    mem_repeat_p(&EP[-(bsy << 1)], g_dc_value, ((bsy + bsx) << 1) + 1);

    /* get prediction pixels ---------------------------------------
     * extra pixels          | left-down pixels   | left pixels   | top-left | top pixels  | top-right pixels  | extra pixels
     * -2*bsy-4 ... -2*bsy-1 | -bsy-bsy ... -bsy-1| -bsy -3 -2 -1 |     0    | 1 2 ... bsx | bsx+1 ... bsx+bsx | 2*bsx+1 ... 2*bsx+4 */

    /* fill top & top-right pixels */
    if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP)) {
        /* fill top pixels */
        gf_davs2.fast_memcpy(&EP[1], pT, bsx * sizeof(pel_t));
    }

    /* fill top-right pixels */
    if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP_RIGHT)) {
        gf_davs2.fast_memcpy(&EP[bsx + 1], &pT[bsx], bsx * sizeof(pel_t));
    } else {
        mem_repeat_p(&EP[bsx + 1], EP[bsx], bsx);   // repeat the last pixel
    }

    /* fill extra pixels */
    num_padding = bsy * 11 / 4 - bsx + 4;
    if (num_padding > 0) {
        mem_repeat_p(&EP[2 * bsx + 1], EP[2 * bsx], num_padding);  // from (2*bsx) to (iX + 3) = (bsy *11/4 + bsx - 1) + 3
    }

    /* fill left & left-down pixels */
    if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT)) {
        /* fill left pixels */
        memcpy(&EP[-bsy], &pLcuEP[-bsy], bsy * sizeof(pel_t));
    }

    /* fill left-down pixels */
    if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT_DOWN)) {
        memcpy(&EP[-2 * bsy], &pLcuEP[-2 * bsy], bsy * sizeof(pel_t));
    } else {
        mem_repeat_p(&EP[-(bsy << 1)], EP[-bsy], bsy);
    }

    /* fill top-left pixel */
    if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP_LEFT)) {
        EP[0] = pLcuEP[0];
    } else if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP)) {
        EP[0] = pT[0];
    } else if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT)) {
        EP[0] = pLcuEP[-1];
    }

    /* fill extra pixels */
    num_padding = bsx * 11 / 4 - bsy + 4;
    if (num_padding > 0) {
        mem_repeat_p(&EP[-2 * bsy - num_padding], EP[-2 * bsy], num_padding);  // from (-2*bsy) to (-iY - 3) = -(bsx *11/4 + bsy - 1) - 3
    }
}

/* ---------------------------------------------------------------------------
 * fill reference samples for intra prediction
 * for a PU inside the LCU (on neither its top nor its left boundary): all
 * reference samples come from the reconstructed picture */
static void fill_reference_samples_xy_c(const pel_t *pTL, int i_TL, const pel_t *pLcuEP, pel_t *EP, uint32_t i_avai, int bsx, int bsy)
{
    const pel_t *pT = pTL + 1;
    const pel_t *pL = pTL + i_TL;
    int num_padding = 0;

    /* fill default value */
    mem_repeat_p(&EP[-(bsy << 1)], g_dc_value, ((bsy + bsx) << 1) + 1);

    /* get prediction pixels ---------------------------------------
     * extra pixels          | left-down pixels   | left pixels   | top-left | top pixels  | top-right pixels  | extra pixels
     * -2*bsy-4 ... -2*bsy-1 | -bsy-bsy ... -bsy-1| -bsy -3 -2 -1 |     0    | 1 2 ... bsx | bsx+1 ... bsx+bsx | 2*bsx+1 ...
2*bsx+4 */ /* fill top & top-right pixels */ if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP)) { /* fill top pixels */ gf_davs2.fast_memcpy(&EP[1], pT, bsx * sizeof(pel_t)); } /* fill top-right pixels */ if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP_RIGHT)) { gf_davs2.fast_memcpy(&EP[bsx + 1], &pT[bsx], bsx * sizeof(pel_t)); } else { mem_repeat_p(&EP[bsx + 1], EP[bsx], bsx); // repeat the last pixel } /* fill extra pixels */ num_padding = bsy * 11 / 4 - bsx + 4; if (num_padding > 0) { mem_repeat_p(&EP[2 * bsx + 1], EP[2 * bsx], num_padding); // from (2*bsx) to (iX + 3) = (bsy *11/4 + bsx - 1) + 3 } /* fill left & left-down pixels */ if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT)) { const pel_t *p_l = pL; int y; /* fill left pixels */ for (y = 0; y < bsy; y++) { EP[-1 - y] = *p_l; p_l += i_TL; } } /* fill left-down pixels */ if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT_DOWN)) { int y; const pel_t *p_l = pL + bsy * i_TL; for (y = 0; y < bsy; y++) { EP[-bsy - 1 - y] = *p_l; p_l += i_TL; } } else { mem_repeat_p(&EP[-(bsy << 1)], EP[-bsy], bsy); } /* fill top-left pixel */ if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP_LEFT)) { EP[0] = pTL[0]; } else if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP)) { EP[0] = pT[0]; } else if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT)) { EP[0] = pL[0]; } /* fill extra pixels */ num_padding = bsx * 11 / 4 - bsy + 4; if (num_padding > 0) { mem_repeat_p(&EP[-2 * bsy - num_padding], EP[-2 * bsy], num_padding); // from (-2*bsy) to (-iY - 3) = -(bsx *11/4 + bsy - 1) - 3 } } /* --------------------------------------------------------------------------- * make intra prediction for luma block */ void davs2_get_intra_pred(davs2_row_rec_t *row_rec, cu_t *p_cu, int predmode, int ctu_x, int ctu_y, int bsx, int bsy) { const int xy = ((ctu_y != 0) << 1) + (ctu_x != 0); pel_t *EP = row_rec->buf_edge_pixels + (MAX_CU_SIZE << 2) - 1; int b8_x = (ctu_x >> MIN_PU_SIZE_IN_BIT) + row_rec->ctu.i_spu_x; int b8_y = (ctu_y >> MIN_PU_SIZE_IN_BIT) + row_rec->ctu.i_spu_y; int i_pred = row_rec->ctu.i_fdec[0]; pel_t *p_pred = row_rec->ctu.p_fdec[0] + ctu_y * i_pred + ctu_x; pel_t *pTL; int i_src; uint32_t avail; assert(predmode >= 0 && predmode < NUM_INTRA_MODE); avail = get_intra_neighbors(row_rec->h, b8_x, b8_y, bsx, bsy, p_cu->i_slice_nr); row_rec->b_block_avail_top = (bool_t)IS_NEIGHBOR_AVAIL(avail, MD_I_TOP ); // used for second transform row_rec->b_block_avail_left = (bool_t)IS_NEIGHBOR_AVAIL(avail, MD_I_LEFT); // used for second transform i_src = i_pred; pTL = p_pred - i_src - 1; gf_davs2.fill_edge_f[xy](pTL, i_src, row_rec->ctu_border[0].rec_top + ctu_x - ctu_y, EP, avail, bsx, bsy); intra_pred(EP, p_pred, i_pred, predmode, bsy, bsx, avail); } /* --------------------------------------------------------------------------- * make intra prediction for chroma block */ void davs2_get_intra_pred_chroma(davs2_row_rec_t *row_rec, cu_t *p_cu, int ctu_c_x, int ctu_c_y) { static const int TAB_CHROMA_MODE_TO_REAL_MODE[NUM_INTRA_MODE_CHROMA] = { DC_PRED, DC_PRED, HOR_PRED, VERT_PRED, BI_PRED }; const int xy = ((ctu_c_y != 0) << 1) + (ctu_c_x != 0); pel_t *EP_u = row_rec->buf_edge_pixels + (MAX_CU_SIZE << 1) - 1; pel_t *EP_v = EP_u + (MAX_CU_SIZE << 2); int bsize_c = 1 << (p_cu->i_cu_level - 1); int b8_x = ((ctu_c_x << 1) >> MIN_PU_SIZE_IN_BIT) + row_rec->ctu.i_spu_x; int b8_y = ((ctu_c_y << 1) >> MIN_PU_SIZE_IN_BIT) + row_rec->ctu.i_spu_y; int luma_mode = p_cu->intra_pred_modes[0]; int chroma_mode = p_cu->c_ipred_mode; int real_mode = (chroma_mode == DM_PRED_C) ? 
luma_mode : TAB_CHROMA_MODE_TO_REAL_MODE[chroma_mode];
    uint32_t avail;

    /* stride and start positions of the chroma prediction blocks */
    int    i_pred   = row_rec->ctu.i_fdec[1];
    pel_t *p_pred_u = row_rec->ctu.p_fdec[1] + ctu_c_y * i_pred + ctu_c_x;
    pel_t *p_pred_v = row_rec->ctu.p_fdec[2] + ctu_c_y * i_pred + ctu_c_x;

    /* position of the top-left neighboring pixel of the U and V blocks */
    int    i_src = i_pred;
    pel_t *pTL_u = p_pred_u - i_src - 1;
    pel_t *pTL_v = p_pred_v - i_src - 1;

    /* check availability of the neighboring blocks, then fill the reference edge samples */
    avail = get_intra_neighbors(row_rec->h, b8_x, b8_y, bsize_c << 1, bsize_c << 1, p_cu->i_slice_nr);

    gf_davs2.fill_edge_f[xy](pTL_u, i_src, row_rec->ctu_border[1].rec_top + ctu_c_x - ctu_c_y, EP_u, avail, bsize_c, bsize_c);
    gf_davs2.fill_edge_f[xy](pTL_v, i_src, row_rec->ctu_border[2].rec_top + ctu_c_x - ctu_c_y, EP_v, avail, bsize_c, bsize_c);

    /* do the prediction for both chroma components */
    intra_pred(EP_u, p_pred_u, i_pred, real_mode, bsize_c, bsize_c, avail);
    intra_pred(EP_v, p_pred_v, i_pred, real_mode, bsize_c, bsize_c, avail);
}
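
/* ---------------------------------------------------------------------------
 * Illustration (added sketch, not compiled): the `xy` index used by both
 * prediction entry points selects one of the four fill routines above from
 * the PU position inside its LCU.  `pLcuEP` stands in for the actual border
 * buffer argument: */
#if 0
int xy = ((ctu_y != 0) << 1) + (ctu_x != 0);
/* xy == 0 : PU at the LCU top-left corner -> fill_reference_samples_0_c
 * xy == 1 : PU on the LCU top row         -> fill_reference_samples_x_c
 * xy == 2 : PU on the LCU left column     -> fill_reference_samples_y_c
 * xy == 3 : PU strictly inside the LCU    -> fill_reference_samples_xy_c */
gf_davs2.fill_edge_f[xy](pTL, i_src, pLcuEP, EP, avail, bsx, bsy);
#endif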
/* --------------------------------------------------------------------------- */
void davs2_intra_pred_init(uint32_t cpuid, ao_funcs_t *pf)
{
#define ANG_X_OFFSET    3
#define ANG_XY_OFFSET   13
#define ANG_Y_OFFSET    25
    int i;
    intra_pred_t *ipred = pf->intraf;

    pf->fill_edge_f[0] = fill_reference_samples_0_c;
    pf->fill_edge_f[1] = fill_reference_samples_x_c;
    pf->fill_edge_f[2] = fill_reference_samples_y_c;
    pf->fill_edge_f[3] = fill_reference_samples_xy_c;

    ipred[DC_PRED   ] = intra_pred_dc_c;        // 0
    ipred[PLANE_PRED] = intra_pred_plane_c;     // 1
    ipred[BI_PRED   ] = intra_pred_bilinear_c;  // 2

    for (i = ANG_X_OFFSET; i < VERT_PRED; i++) {
        ipred[i] = intra_pred_ang_x_c;          // 3 ~ 11
    }
    ipred[VERT_PRED] = intra_pred_ver_c;        // 12

    for (i = ANG_XY_OFFSET; i < HOR_PRED; i++) {
        ipred[i] = intra_pred_ang_xy_c;         // 13 ~ 23
    }
    ipred[HOR_PRED] = intra_pred_hor_c;         // 24

    for (i = ANG_Y_OFFSET; i < NUM_INTRA_MODE; i++) {
        ipred[i] = intra_pred_ang_y_c;          // 25 ~ 32
    }

    ipred[INTRA_ANG_X_3  ] = intra_pred_ang_x_3_c;
    ipred[INTRA_ANG_X_4  ] = intra_pred_ang_x_4_c;
    ipred[INTRA_ANG_X_5  ] = intra_pred_ang_x_5_c;
    ipred[INTRA_ANG_X_6  ] = intra_pred_ang_x_6_c;
    ipred[INTRA_ANG_X_7  ] = intra_pred_ang_x_7_c;
    ipred[INTRA_ANG_X_8  ] = intra_pred_ang_x_8_c;
    ipred[INTRA_ANG_X_9  ] = intra_pred_ang_x_9_c;
    ipred[INTRA_ANG_X_10 ] = intra_pred_ang_x_10_c;
    ipred[INTRA_ANG_X_11 ] = intra_pred_ang_x_11_c;
    ipred[INTRA_ANG_XY_13] = intra_pred_ang_xy_13_c;
    ipred[INTRA_ANG_XY_14] = intra_pred_ang_xy_14_c;
    ipred[INTRA_ANG_XY_16] = intra_pred_ang_xy_16_c;
    ipred[INTRA_ANG_XY_18] = intra_pred_ang_xy_18_c;
    ipred[INTRA_ANG_XY_20] = intra_pred_ang_xy_20_c;
    ipred[INTRA_ANG_XY_22] = intra_pred_ang_xy_22_c;
    ipred[INTRA_ANG_XY_23] = intra_pred_ang_xy_23_c;
    ipred[INTRA_ANG_Y_25 ] = intra_pred_ang_y_25_c;
    ipred[INTRA_ANG_Y_26 ] = intra_pred_ang_y_26_c;
    ipred[INTRA_ANG_Y_27 ] = intra_pred_ang_y_27_c;
    ipred[INTRA_ANG_Y_28 ] = intra_pred_ang_y_28_c;
    ipred[INTRA_ANG_Y_29 ] = intra_pred_ang_y_29_c;
    ipred[INTRA_ANG_Y_30 ] = intra_pred_ang_y_30_c;
    ipred[INTRA_ANG_Y_31 ] = intra_pred_ang_y_31_c;
    ipred[INTRA_ANG_Y_32 ] = intra_pred_ang_y_32_c;

#if HAVE_MMX
    if (cpuid & DAVS2_CPU_SSE4) {
#if !HIGH_BIT_DEPTH
        ipred[DC_PRED   ] = intra_pred_dc_sse128;
        ipred[PLANE_PRED] = intra_pred_plane_sse128;
        ipred[BI_PRED   ] = intra_pred_bilinear_sse128;
        ipred[HOR_PRED  ] = intra_pred_hor_sse128;
        ipred[VERT_PRED ] = intra_pred_ver_sse128;

        ipred[INTRA_ANG_X_3  ] = intra_pred_ang_x_3_sse128;
        ipred[INTRA_ANG_X_4  ] = intra_pred_ang_x_4_sse128;
        ipred[INTRA_ANG_X_6  ] = intra_pred_ang_x_6_sse128;
        ipred[INTRA_ANG_X_8  ] = intra_pred_ang_x_8_sse128;
        ipred[INTRA_ANG_X_10 ] = intra_pred_ang_x_10_sse128;
        ipred[INTRA_ANG_XY_14] = intra_pred_ang_xy_14_sse128;
        ipred[INTRA_ANG_XY_16] = intra_pred_ang_xy_16_sse128;
        ipred[INTRA_ANG_XY_18] = intra_pred_ang_xy_18_sse128;
        ipred[INTRA_ANG_XY_20] = intra_pred_ang_xy_20_sse128;
        ipred[INTRA_ANG_X_5  ] = intra_pred_ang_x_5_sse128;
        //ipred[INTRA_ANG_X_7 ] = intra_pred_ang_x_7_sse128;
        //ipred[INTRA_ANG_X_9 ] = intra_pred_ang_x_9_sse128;
        //ipred[INTRA_ANG_X_11] = intra_pred_ang_x_11_sse128;
        ipred[INTRA_ANG_XY_13] = intra_pred_ang_xy_13_sse128;
        ipred[INTRA_ANG_XY_22] = intra_pred_ang_xy_22_sse128;
        ipred[INTRA_ANG_XY_23] = intra_pred_ang_xy_23_sse128;
        ipred[INTRA_ANG_Y_25 ] = intra_pred_ang_y_25_sse128;
        ipred[INTRA_ANG_Y_26 ] = intra_pred_ang_y_26_sse128;
        ipred[INTRA_ANG_Y_28 ] = intra_pred_ang_y_28_sse128;
        ipred[INTRA_ANG_Y_30 ] = intra_pred_ang_y_30_sse128;
        ipred[INTRA_ANG_Y_31 ] = intra_pred_ang_y_31_sse128;
        ipred[INTRA_ANG_Y_32 ] = intra_pred_ang_y_32_sse128;

        pf->fill_edge_f[0] = fill_edge_samples_0_sse128;
        pf->fill_edge_f[1] = fill_edge_samples_x_sse128;
        pf->fill_edge_f[2] = fill_edge_samples_y_sse128;
        pf->fill_edge_f[3] = fill_edge_samples_xy_sse128;
#endif
    }

    /* 8/10-bit assembly */
    if (cpuid & DAVS2_CPU_AVX2) {
#if !HIGH_BIT_DEPTH
        ipred[DC_PRED   ] = intra_pred_dc_avx;
        ipred[HOR_PRED  ] = intra_pred_hor_avx;
        ipred[VERT_PRED ] = intra_pred_ver_avx;
        ipred[PLANE_PRED] = intra_pred_plane_avx;
        ipred[BI_PRED   ] = intra_pred_bilinear_avx;

        ipred[INTRA_ANG_X_3  ] = intra_pred_ang_x_3_avx;
        ipred[INTRA_ANG_X_4  ] = intra_pred_ang_x_4_avx;
        ipred[INTRA_ANG_X_5  ] = intra_pred_ang_x_5_avx;
        ipred[INTRA_ANG_X_6  ] = intra_pred_ang_x_6_avx;
        //ipred[INTRA_ANG_X_7 ] = intra_pred_ang_x_7_avx;
        ipred[INTRA_ANG_X_8  ] = intra_pred_ang_x_8_avx;
        //ipred[INTRA_ANG_X_9 ] = intra_pred_ang_x_9_avx;
        ipred[INTRA_ANG_X_10 ] = intra_pred_ang_x_10_avx;
        //ipred[INTRA_ANG_X_11] = intra_pred_ang_x_11_avx;
        ipred[INTRA_ANG_XY_13] = intra_pred_ang_xy_13_avx;
        ipred[INTRA_ANG_XY_14] = intra_pred_ang_xy_14_avx;
        ipred[INTRA_ANG_XY_16] = intra_pred_ang_xy_16_avx;
        ipred[INTRA_ANG_XY_18] = intra_pred_ang_xy_18_avx;
        ipred[INTRA_ANG_XY_20] = intra_pred_ang_xy_20_avx;
#if _MSC_VER
        // TODO: 20180206 causes unexpected exit on Linux
        ipred[INTRA_ANG_XY_22] = intra_pred_ang_xy_22_avx;
#endif
        ipred[INTRA_ANG_XY_23] = intra_pred_ang_xy_23_avx;
        ipred[INTRA_ANG_Y_25 ] = intra_pred_ang_y_25_avx;
        ipred[INTRA_ANG_Y_26 ] = intra_pred_ang_y_26_avx;
        ipred[INTRA_ANG_Y_28 ] = intra_pred_ang_y_28_avx;
        ipred[INTRA_ANG_Y_30 ] = intra_pred_ang_y_30_avx;
        ipred[INTRA_ANG_Y_31 ] = intra_pred_ang_y_31_avx;
        ipred[INTRA_ANG_Y_32 ] = intra_pred_ang_y_32_avx;
#endif
    }
#else
    UNUSED_PARAMETER(cpuid);
#endif  // if HAVE_MMX

#undef ANG_X_OFFSET
#undef ANG_XY_OFFSET
#undef ANG_Y_OFFSET
}
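
/* ---------------------------------------------------------------------------
 * Usage sketch (added illustration, not compiled): `detected_cpu_flags`,
 * `EP`, `p_pred` and `i_pred` are placeholders for whatever the caller
 * provides.  The table is filled once, then every PU dispatches through it
 * by prediction mode: */
#if 0
ao_funcs_t funcs;
davs2_intra_pred_init(detected_cpu_flags, &funcs);
/* modes 0..2: DC/PLANE/BI, 3..11: ang_x, 12: VERT, 13..23: ang_xy,
 * 24: HOR, 25..32: ang_y -- specialized versions overwrite generic ones */
funcs.intraf[VERT_PRED](EP, p_pred, i_pred, VERT_PRED, 16, 16);
#endif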
davs2-1.6/source/common/intra.h000066400000000000000000000047051337322544400164710ustar00rootroot00000000000000/*
 * intra.h
 *
 * Description of this file:
 *    Intra prediction functions definition of the davs2 library
 *
 * --------------------------------------------------------------------------
 *
 * davs2 - video decoder of AVS2/IEEE1857.4 video coding standard
 * Copyright (C) 2018~ VCL, NELVT, Peking University
 *
 * Authors: Falei LUO
 *          etc.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at sswang @ pku.edu.cn.
 */

#ifndef DAVS2_INTRA_H
#define DAVS2_INTRA_H

#ifdef __cplusplus
extern "C" {
#endif

/* --------------------------------------------------------------------------- */
static ALWAYS_INLINE void intra_pred(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsy, int bsx, int i_avail)
{
    if (dir_mode != DC_PRED) {
        gf_davs2.intraf[dir_mode](src, dst, i_dst, dir_mode, bsx, bsy);
    } else {
        int b_top   = !!IS_NEIGHBOR_AVAIL(i_avail, MD_I_TOP);
        int b_left  = !!IS_NEIGHBOR_AVAIL(i_avail, MD_I_LEFT);
        int mode_ex = ((b_top << 8) + b_left);
        gf_davs2.intraf[dir_mode](src, dst, i_dst, mode_ex, bsx, bsy);
    }
}

#define davs2_intra_pred_init FPFX(intra_pred_init)
void davs2_intra_pred_init(uint32_t cpuid, ao_funcs_t *pf);
#define davs2_get_intra_pred FPFX(get_intra_pred)
void davs2_get_intra_pred(davs2_row_rec_t *row_rec, cu_t *p_cu, int predmode, int ctu_x, int ctu_y, int bsx, int bsy);
#define davs2_get_intra_pred_chroma FPFX(get_intra_pred_chroma)
void davs2_get_intra_pred_chroma(davs2_row_rec_t *h, cu_t *p_cu, int ctu_c_x, int ctu_c_y);

#ifdef __cplusplus
}
#endif
#endif  // DAVS2_INTRA_H
davs2-1.6/source/common/mc.cc000066400000000000000000000674351337322544400161210ustar00rootroot00000000000000/*
 * mc.cc
 *
 * Description of this file:
 *    MC functions definition of the davs2 library
 *
 * --------------------------------------------------------------------------
 *
 * davs2 - video decoder of AVS2/IEEE1857.4 video coding standard
 * Copyright (C) 2018~ VCL, NELVT, Peking University
 *
 * Authors: Falei LUO
 *          etc.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at sswang @ pku.edu.cn.
 */

#include
#include

#include "common.h"
#include "mc.h"

#if HAVE_MMX
#include "vec/intrinsic.h"
#include "x86/ipfilter8.h"
#endif

#if defined(_MSC_VER) || defined(__INTEL_COMPILER)
/* ---------------------------------------------------------------------------
 * disable warning C4127: conditional expression is constant */
#pragma warning(disable: 4127)
#endif
/**
 * ===========================================================================
 * local & global variables
 * ===========================================================================
 */

/* ---------------------------------------------------------------------------
 * interpolate filter (luma) */
ALIGN16(static const int8_t INTPL_FILTERS[4][8]) = {
    {  0, 0,   0, 64,  0,   0, 0,  0 },     /* for full-pixel, no use */
    { -1, 4, -10, 57, 19,  -7, 3, -1 },
    { -1, 4, -11, 40, 40, -11, 4, -1 },
    { -1, 3,  -7, 19, 57, -10, 4, -1 }
};

/* ---------------------------------------------------------------------------
 * interpolate filter (chroma) */
ALIGN16(static const int8_t INTPL_FILTERS_C[8][4]) = {
    {  0, 64,  0,  0 },                     /* for full-pixel, no use */
    { -4, 62,  6,  0 },
    { -6, 56, 15, -1 },
    { -5, 47, 25, -3 },
    { -4, 36, 36, -4 },
    { -3, 25, 47, -5 },
    { -1, 15, 56, -6 },
    {  0,  6, 62, -4 }
};

/**
 * ===========================================================================
 * macros
 * ===========================================================================
 */

/* ---------------------------------------------------------------------------
 * for luma interpolating (horizontal) */
#define FLT_8TAP_HOR(src, i, coef) ( \
    (src)[i - 3] * (coef)[0] + \
    (src)[i - 2] * (coef)[1] + \
    (src)[i - 1] * (coef)[2] + \
    (src)[i    ] * (coef)[3] + \
    (src)[i + 1] * (coef)[4] + \
    (src)[i + 2] * (coef)[5] + \
    (src)[i + 3] * (coef)[6] + \
    (src)[i + 4] * (coef)[7])

/* ---------------------------------------------------------------------------
 * for luma interpolating (vertical) */
#define FLT_8TAP_VER(src, i, i_src, coef) ( \
    (src)[i - 3 * i_src] * (coef)[0] + \
    (src)[i - 2 * i_src] * (coef)[1] + \
    (src)[i -     i_src] * (coef)[2] + \
    (src)[i            ] * (coef)[3] + \
    (src)[i +     i_src] * (coef)[4] + \
    (src)[i + 2 * i_src] * (coef)[5] + \
    (src)[i + 3 * i_src] * (coef)[6] + \
    (src)[i + 4 * i_src] * (coef)[7])

/* ---------------------------------------------------------------------------
 * for chroma interpolating (horizontal) */
#define FLT_4TAP_HOR(src, i, coef) ( \
    (src)[i - 1] * (coef)[0] + \
    (src)[i    ] * (coef)[1] + \
    (src)[i + 1] * (coef)[2] + \
    (src)[i + 2] * (coef)[3])

/* ---------------------------------------------------------------------------
 * for chroma interpolating (vertical) */
#define FLT_4TAP_VER(src, i, i_src, coef) ( \
    (src)[i -     i_src] * (coef)[0] + \
    (src)[i            ] * (coef)[1] + \
    (src)[i +     i_src] * (coef)[2] + \
    (src)[i + 2 * i_src] * (coef)[3])
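
/* ---------------------------------------------------------------------------
 * Sanity sketch (added note): each luma filter's taps sum to 64, and so do
 * the chroma filters', so on a flat region the interpolation is an identity.
 * E.g. the half-pel luma filter INTPL_FILTERS[2]:
 *     -1 + 4 - 11 + 40 + 40 - 11 + 4 - 1 == 64
 * hence for src[i] == P for all i:
 *     (FLT_8TAP_HOR(src, i, INTPL_FILTERS[2]) + 32) >> 6 == (64 * P + 32) >> 6 == P
 */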
/**
 * ===========================================================================
 * interpolate
 * ===========================================================================
 */

/* --------------------------------------------------------------------------- */
static void mc_block_copy_c(pel_t *dst, intptr_t i_dst, pel_t *src, intptr_t i_src, int w, int h)
{
    while (h--) {
        memcpy(dst, src, w * sizeof(pel_t));
        dst += i_dst;
        src += i_src;
    }
}

/* --------------------------------------------------------------------------- */
static void mc_block_copy_sc_c(coeff_t *dst, intptr_t i_dst, int16_t *src, intptr_t i_src, int w, int h)
{
    int i;

    if (sizeof(coeff_t) == sizeof(int16_t)) {
        while (h--) {
            memcpy(dst, src, w * sizeof(coeff_t));
            dst += i_dst;
            src += i_src;
        }
    } else {
        while (h--) {
            for (i = 0; i < w; i++) {
                dst[i] = src[i];
            }
            dst += i_dst;
            src += i_src;
        }
    }
}

/* --------------------------------------------------------------------------- */
static void intpl_chroma_block_hor_c(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, int8_t const *coeff)
{
    int x, y, v;

    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            v = (FLT_4TAP_HOR(src, x, coeff) + 32) >> 6;
            dst[x] = (pel_t)DAVS2_CLIP1(v);
        }
        src += i_src;
        dst += i_dst;
    }
}

/* --------------------------------------------------------------------------- */
static void intpl_chroma_block_ver_c(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, int8_t const *coeff)
{
    int x, y, v;

    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            v = (FLT_4TAP_VER(src, x, i_src, coeff) + 32) >> 6;
            dst[x] = (pel_t)DAVS2_CLIP1(v);
        }
        src += i_src;
        dst += i_dst;
    }
}

/* --------------------------------------------------------------------------- */
static void intpl_chroma_block_ext_c(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff_h, const int8_t *coeff_v)
{
    // TODO: unify with the luma path
    ALIGN16(int32_t tmp_res[(32 + 3) * 32]);
    int32_t *tmp = tmp_res;
    const int shift1 = g_bit_depth - 8;
    const int add1   = (1 << shift1) >> 1;
    const int shift2 = 20 - g_bit_depth;
    const int add2   = 1 << (shift2 - 1);   // 1 << (19 - g_bit_depth)
    int x, y, v;

    src -= i_src;
    for (y = -1; y < height + 2; y++) {
        for (x = 0; x < width; x++) {
            v = FLT_4TAP_HOR(src, x, coeff_h);
            tmp[x] = (v + add1) >> shift1;
        }
        src += i_src;
        tmp += 32;
    }

    tmp = tmp_res + 32;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            v = (FLT_4TAP_VER(tmp, x, 32, coeff_v) + add2) >> shift2;
            dst[x] = (pel_t)DAVS2_CLIP1(v);
        }
        dst += i_dst;
        tmp += 32;
    }
}

/* --------------------------------------------------------------------------- */
static void intpl_luma_block_hor_c(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, int8_t const *coeff)
{
    int x, y, v;

    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            v = (FLT_8TAP_HOR(src, x, coeff) + 32) >> 6;
            dst[x] = (pel_t)DAVS2_CLIP1(v);
        }
        src += i_src;
        dst += i_dst;
    }
}

/* --------------------------------------------------------------------------- */
static void intpl_luma_block_ver_c(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, int8_t const *coeff)
{
    int x, y, v;

    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            v = (FLT_8TAP_VER(src, x, i_src, coeff) + 32) >> 6;
            dst[x] = (pel_t)DAVS2_CLIP1(v);
        }
        src += i_src;
        dst += i_dst;
    }
}

/* --------------------------------------------------------------------------- */
static void intpl_luma_block_ext_c(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff_h, const int8_t *coeff_v)
{
#define TMP_STRIDE 64
    const int shift1 = g_bit_depth - 8;
    const int add1   = (1 << shift1) >> 1;
    const int shift2 = 20 - g_bit_depth;
    const int add2   = 1 << (shift2 - 1);   // 1 << (19 - bit_depth)
    ALIGN16(mct_t tmp_buf[(64 + 7) * TMP_STRIDE]);
    mct_t *tmp = tmp_buf;
    int x, y, v;

    src -= 3 * i_src;
    for (y = -3; y < height + 4; y++) {
        for (x = 0; x < width; x++) {
            v = FLT_8TAP_HOR(src, x, coeff_h);
            tmp[x] = (mct_t)((v + add1) >> shift1);
        }
        src += i_src;
        tmp += TMP_STRIDE;
    }

    tmp = tmp_buf + 3 * TMP_STRIDE;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            v = (FLT_8TAP_VER(tmp, x, TMP_STRIDE, coeff_v) + add2) >> shift2;
            dst[x] = (pel_t)DAVS2_CLIP1(v);
        }
        dst += i_dst;
        tmp += TMP_STRIDE;
    }
#undef TMP_STRIDE
}

/* --------------------------------------------------------------------------- */
#define INTERP_HOR_C(width, height) \
    static void interp_horiz_pp_##width##x##height##_c(const pel_t* src, intptr_t srcStride, pel_t* dst, intptr_t dstStride, int coeffIdx) \
    { \
        const int N = 8;  /* 8-tap Luma interpolation */ \
        const int8_t* coeff = (N == 4) ?
INTPL_FILTERS_C[coeffIdx] : INTPL_FILTERS[coeffIdx]; \ int headRoom = 6; /* Log2 of sum of filter taps */ \ int offset = (1 << (headRoom - 1)); \ uint16_t maxVal = (1 << BIT_DEPTH) - 1; \ int cStride = 1; \ src -= (N / 2 - 1) * cStride; \ int row, col; \ for (row = 0; row < height; row++) { \ for (col = 0; col < width; col++) { \ int sum = src[col + 0 * cStride] * coeff[0]; \ sum += src[col + 1 * cStride] * coeff[1]; \ sum += src[col + 2 * cStride] * coeff[2]; \ sum += src[col + 3 * cStride] * coeff[3]; \ if (N == 8) { \ sum += src[col + 4 * cStride] * coeff[4]; \ sum += src[col + 5 * cStride] * coeff[5]; \ sum += src[col + 6 * cStride] * coeff[6]; \ sum += src[col + 7 * cStride] * coeff[7]; \ } \ int16_t val = (int16_t)((sum + offset) >> headRoom); \ val = DAVS2_CLIP3(0, maxVal, val); \ dst[col] = (pel_t)val; \ } \ src += srcStride; \ dst += dstStride; \ } \ } /* --------------------------------------------------------------------------- */ #define INTERP_PS_HOR_C(width, height) \ static void interp_horiz_ps_##width##x##height##_c(const pel_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx) \ { \ const int N = 8; /* 8-tap Luma interpolation */ \ const int8_t* coeff = (N == 4) ? INTPL_FILTERS_C[coeffIdx] : INTPL_FILTERS[coeffIdx]; \ int headRoom = 6; /* Log2 of sum of filter taps */ \ int offset = (1 << (headRoom - 1)); \ uint16_t maxVal = (1 << BIT_DEPTH) - 1; \ int cStride = 1; \ src -= (N / 2 - 1) * cStride; \ int row, col; \ for (row = 0; row < height; row++) { \ for (col = 0; col < width; col++) { \ int sum = src[col + 0 * cStride] * coeff[0]; \ sum += src[col + 1 * cStride] * coeff[1]; \ sum += src[col + 2 * cStride] * coeff[2]; \ sum += src[col + 3 * cStride] * coeff[3]; \ if (N == 8) { \ sum += src[col + 4 * cStride] * coeff[4]; \ sum += src[col + 5 * cStride] * coeff[5]; \ sum += src[col + 6 * cStride] * coeff[6]; \ sum += src[col + 7 * cStride] * coeff[7]; \ } \ int16_t val = (int16_t)((sum + offset) >> headRoom); \ val = DAVS2_CLIP3(0, maxVal, val); \ dst[col] = (pel_t)val; \ } \ src += srcStride; \ dst += dstStride; \ } \ } /* --------------------------------------------------------------------------- */ #define INTERP_VER_C(width, height) \ static void interp_vert_pp_##width##x##height##_c(const pel_t* src, intptr_t srcStride, pel_t* dst, intptr_t dstStride, int coeffIdx) \ { \ const int N = 8; /* 8-tap Luma interpolation */ \ const int8_t* c = (N == 4) ? 
INTPL_FILTERS_C[coeffIdx] : INTPL_FILTERS[coeffIdx]; \
        int shift  = 6; \
        int offset = 1 << (shift - 1); \
        uint16_t maxVal = (1 << BIT_DEPTH) - 1; \
        src -= (N / 2 - 1) * srcStride; \
        int row, col; \
        for (row = 0; row < height; row++) { \
            for (col = 0; col < width; col++) { \
                int sum = src[col + 0 * srcStride] * c[0]; \
                sum += src[col + 1 * srcStride] * c[1]; \
                sum += src[col + 2 * srcStride] * c[2]; \
                sum += src[col + 3 * srcStride] * c[3]; \
                if (N == 8) { \
                    sum += src[col + 4 * srcStride] * c[4]; \
                    sum += src[col + 5 * srcStride] * c[5]; \
                    sum += src[col + 6 * srcStride] * c[6]; \
                    sum += src[col + 7 * srcStride] * c[7]; \
                } \
                int16_t val = (int16_t)((sum + offset) >> shift); \
                val = DAVS2_CLIP3(0, maxVal, val); \
                dst[col] = (pel_t)val; \
            } \
            src += srcStride; \
            dst += dstStride; \
        } \
    }

/* --------------------------------------------------------------------------- */
#define INTERP_SP_VER_C(w, h) \
    static void filterVertical_sp_##w##x##h##_c(const int16_t* src, intptr_t srcStride, pel_t* dst, intptr_t dstStride, int coeffIdx) \
    { \
        const int N = 8;  /* 8-tap Luma interpolation */ \
        int headRoom = 14 - BIT_DEPTH; \
        int shift  = 6 + headRoom; \
        int offset = (1 << (shift - 1)) + ((1 << 13) << 6); \
        const int8_t* c = (N == 4 ? INTPL_FILTERS_C[coeffIdx] : INTPL_FILTERS[coeffIdx]); \
        int16_t maxVal = (1 << BIT_DEPTH) - 1; \
        src -= (N / 2 - 1) * srcStride; \
        int row, col; \
        for (row = 0; row < h; row++) { \
            for (col = 0; col < w; col++) { \
                int sum = src[col + 0 * srcStride] * c[0]; \
                sum += src[col + 1 * srcStride] * c[1]; \
                sum += src[col + 2 * srcStride] * c[2]; \
                sum += src[col + 3 * srcStride] * c[3]; \
                if (N == 8) { \
                    sum += src[col + 4 * srcStride] * c[4]; \
                    sum += src[col + 5 * srcStride] * c[5]; \
                    sum += src[col + 6 * srcStride] * c[6]; \
                    sum += src[col + 7 * srcStride] * c[7]; \
                } \
                int16_t val = (int16_t)((sum + offset) >> shift); \
                val = DAVS2_CLIP3(0, maxVal, val); \
                dst[col] = (pel_t)val; \
            } \
            src += srcStride; \
            dst += dstStride; \
        } \
    }

/* --------------------------------------------------------------------------- */
#define INTERP_EXT_C(width, height) \
    static void interp_hv_pp_##width##x##height##_c(const pel_t* src, intptr_t srcStride, pel_t* dst, intptr_t dstStride, int idxX, int idxY) \
    { \
        int16_t immedVals[(64 + 8) * (64 + 8)]; \
        interp_horiz_ps_##width##x##height##_c(src, srcStride, immedVals, width, idxX); \
        filterVertical_sp_##width##x##height##_c(immedVals + 3 * width, width, dst, dstStride, idxY); \
    }

#define INTPL_OP_C(w, h) \
    INTERP_HOR_C(w, h) \
    INTERP_PS_HOR_C(w, h) \
    INTERP_VER_C(w, h) \
    INTERP_SP_VER_C(w, h) \
    INTERP_EXT_C(w, h)

//INTPL_OP_C(64, 64)    /* 64x64 */
//INTPL_OP_C(64, 32)
//INTPL_OP_C(32, 64)
//INTPL_OP_C(64, 16)
//INTPL_OP_C(64, 48)
//INTPL_OP_C(16, 64)
//INTPL_OP_C(48, 64)
//INTPL_OP_C(32, 32)    /* 32x32 */
//INTPL_OP_C(32, 16)
//INTPL_OP_C(16, 32)
//INTPL_OP_C(32, 8)
//INTPL_OP_C(32, 24)
//INTPL_OP_C(8, 32)
//INTPL_OP_C(24, 32)
//INTPL_OP_C(16, 16)    /* 16x16 */
//INTPL_OP_C(16, 8)
//INTPL_OP_C(8, 16)
//INTPL_OP_C(16, 4)
//INTPL_OP_C(16, 12)
//INTPL_OP_C(4, 16)
//INTPL_OP_C(12, 16)
//INTPL_OP_C(8, 8)      /* 8x8 */
//INTPL_OP_C(8, 4)
//INTPL_OP_C(4, 8)
//INTPL_OP_C(4, 4)      /* 4x4 */

/* ---------------------------------------------------------------------------
 * interpolation of 1/4 subpixel
 *  A  dst  1  src  B
 *  c   d   e   f
 *  2   h   3   i
 *  j   k   l   m
 *  C           D
 */
void mc_luma(davs2_t *h, pel_t *dst, int i_dst, int posx, int posy, int width, int height, pel_t *p_fref, int i_fref)
{
    const int dx = posx & 3;
    const int dy = posy & 3;
    const int mc_part_index = MC_PART_INDEX(width,
height); pel_t *fref; UNUSED_PARAMETER(h); posx >>= 2; posy >>= 2; fref = p_fref; p_fref += posy * i_fref + posx; if (dx == 0 && dy == 0) { gf_davs2.copy_pp[PART_INDEX(width, height)](dst, i_dst, p_fref, i_fref); } else if (dx == 0) { #if USE_NEW_INTPL gf_davs2.block_intpl_luma_ver[PART_INDEX(width, height)](p_fref, i_fref, dst, i_dst, dy); #else gf_davs2.intpl_luma_ver[mc_part_index][dy - 1](dst, i_dst, p_fref, i_fref, width, height, INTPL_FILTERS[dy]); #endif } else if (dy == 0) { #if USE_NEW_INTPL gf_davs2.block_intpl_luma_hor[PART_INDEX(width, height)](p_fref, i_fref, dst, i_dst, dx); #else gf_davs2.intpl_luma_hor[mc_part_index][dx - 1](dst, i_dst, p_fref, i_fref, width, height, INTPL_FILTERS[dx]); #endif } else { gf_davs2.intpl_luma_ext[mc_part_index](dst, i_dst, p_fref, i_fref, width, height, INTPL_FILTERS[dx], INTPL_FILTERS[dy]); } } /* --------------------------------------------------------------------------- */ void mc_chroma(davs2_t *h, pel_t *dst, int i_dst, int posx, int posy, int width, int height, pel_t *p_fref, int i_fref) { const int dx = posx & 7; const int dy = posy & 7; const int mc_part_index = MC_PART_INDEX(width, height); UNUSED_PARAMETER(h); posx >>= 3; posy >>= 3; p_fref += posy * i_fref + posx; if (dx == 0 && dy == 0) { if (width != 2 && width != 6 && height != 2 && height != 6) { gf_davs2.copy_pp[PART_INDEX(width, height)](dst, i_dst, p_fref, i_fref); } else { gf_davs2.block_copy(dst, i_dst, p_fref, i_fref, width, height); } } else if (dx == 0) { gf_davs2.intpl_chroma_ver[mc_part_index](dst, i_dst, p_fref, i_fref, width, height, INTPL_FILTERS_C[dy]); } else if (dy == 0) { gf_davs2.intpl_chroma_hor[mc_part_index](dst, i_dst, p_fref, i_fref, width, height, INTPL_FILTERS_C[dx]); } else { gf_davs2.intpl_chroma_ext[mc_part_index](dst, i_dst, p_fref, i_fref, width, height, INTPL_FILTERS_C[dx], INTPL_FILTERS_C[dy]); } } /** * =========================================================================== * pixel block average * =========================================================================== */ /* --------------------------------------------------------------------------- */ static void davs2_pixel_average_c(pel_t *dst, int i_dst, const pel_t *src0, int i_src0, const pel_t *src1, int i_src1, int width, int height) { int i, j; for (i = 0; i < height; i++) { for (j = 0; j < width; j++) { dst[j] = (pel_t)((src0[j] + src1[j] + 1) >> 1); } dst += i_dst; src0 += i_src0; src1 += i_src1; } } /** * =========================================================================== * plane copy * =========================================================================== */ /* --------------------------------------------------------------------------- */ #define plane_copy_c mc_block_copy_c #define ALL_LUMA_PU(name1, name2, cpu) \ pf->name1[PART_64x64] = name2 ## _64x64 ##_## cpu; /* 64x64 */ \ pf->name1[PART_64x32] = name2 ## _64x32 ##_## cpu;\ pf->name1[PART_32x64] = name2 ## _32x64 ##_## cpu;\ pf->name1[PART_64x16] = name2 ## _64x16 ##_## cpu;\ pf->name1[PART_64x48] = name2 ## _64x48 ##_## cpu;\ pf->name1[PART_16x64] = name2 ## _16x64 ##_## cpu;\ pf->name1[PART_48x64] = name2 ## _48x64 ##_## cpu;\ pf->name1[PART_32x32] = name2 ## _32x32 ##_## cpu; /* 32x32 */ \ pf->name1[PART_32x16] = name2 ## _32x16 ##_## cpu;\ pf->name1[PART_16x32] = name2 ## _16x32 ##_## cpu;\ pf->name1[PART_32x8 ] = name2 ## _32x8 ##_## cpu;\ pf->name1[PART_32x24] = name2 ## _32x24 ##_## cpu;\ pf->name1[PART_8x32 ] = name2 ## _8x32 ##_## cpu;\ pf->name1[PART_24x32] = name2 ## _24x32 ##_## cpu;\ 
pf->name1[PART_16x16] = name2 ## _16x16 ##_## cpu;  /* 16x16 */ \
    pf->name1[PART_16x8 ] = name2 ## _16x8  ##_## cpu; \
    pf->name1[PART_8x16 ] = name2 ## _8x16  ##_## cpu; \
    pf->name1[PART_16x4 ] = name2 ## _16x4  ##_## cpu; \
    pf->name1[PART_16x12] = name2 ## _16x12 ##_## cpu; \
    pf->name1[PART_4x16 ] = name2 ## _4x16  ##_## cpu; \
    pf->name1[PART_12x16] = name2 ## _12x16 ##_## cpu; \
    pf->name1[PART_8x8  ] = name2 ## _8x8   ##_## cpu;  /* 8x8 */ \
    pf->name1[PART_8x4  ] = name2 ## _8x4   ##_## cpu; \
    pf->name1[PART_4x8  ] = name2 ## _4x8   ##_## cpu; \
    pf->name1[PART_4x4  ] = name2 ## _4x4   ##_## cpu   /* 4x4 */

/**
 * ===========================================================================
 * interface function defines
 * ===========================================================================
 */

/* --------------------------------------------------------------------------- */
void davs2_mc_init(uint32_t cpuid, ao_funcs_t *pf)
{
    UNUSED_PARAMETER(cpuid);

    /* plane copy */
    pf->plane_copy       = plane_copy_c;
    pf->block_copy       = mc_block_copy_c;
    pf->block_coeff_copy = mc_block_copy_sc_c;

    /* block average */
    pf->block_avg = davs2_pixel_average_c;

    /* interpolate */
#if USE_NEW_INTPL
    ALL_LUMA_PU(block_intpl_luma_hor, interp_horiz_pp, c);
    ALL_LUMA_PU(block_intpl_luma_ver, interp_vert_pp, c);
    ALL_LUMA_PU(block_intpl_luma_ext, interp_hv_pp, c);
#endif
    pf->intpl_luma_ver[0][0] = intpl_luma_block_ver_c;
    pf->intpl_luma_ver[0][1] = intpl_luma_block_ver_c;
    pf->intpl_luma_ver[0][2] = intpl_luma_block_ver_c;
    pf->intpl_luma_hor[0][0] = intpl_luma_block_hor_c;
    pf->intpl_luma_hor[0][1] = intpl_luma_block_hor_c;
    pf->intpl_luma_hor[0][2] = intpl_luma_block_hor_c;
    pf->intpl_luma_ext[0]    = intpl_luma_block_ext_c;
    pf->intpl_chroma_ver[0]  = intpl_chroma_block_ver_c;
    pf->intpl_chroma_hor[0]  = intpl_chroma_block_hor_c;
    pf->intpl_chroma_ext[0]  = intpl_chroma_block_ext_c;

    pf->intpl_luma_ver[1][0] = intpl_luma_block_ver_c;
    pf->intpl_luma_ver[1][1] = intpl_luma_block_ver_c;
    pf->intpl_luma_ver[1][2] = intpl_luma_block_ver_c;
    pf->intpl_luma_hor[1][0] = intpl_luma_block_hor_c;
    pf->intpl_luma_hor[1][1] = intpl_luma_block_hor_c;
    pf->intpl_luma_hor[1][2] = intpl_luma_block_hor_c;
    pf->intpl_luma_ext[1]    = intpl_luma_block_ext_c;
    pf->intpl_chroma_ver[1]  = intpl_chroma_block_ver_c;
    pf->intpl_chroma_hor[1]  = intpl_chroma_block_hor_c;
    pf->intpl_chroma_ext[1]  = intpl_chroma_block_ext_c;

    /* init asm function handles */
#if HAVE_MMX
    if (cpuid & DAVS2_CPU_SSE42) {
#if HIGH_BIT_DEPTH
        // 10-bit assembly
#else
#if USE_NEW_INTPL
        ALL_LUMA_PU(block_intpl_luma_hor, davs2_interp_8tap_horiz_pp, sse4);
        pf->block_intpl_luma_hor[PART_4x4] = davs2_interp_8tap_horiz_pp_4x4_sse4;
        ALL_LUMA_PU(block_intpl_luma_ver, davs2_interp_8tap_vert_pp, sse4);
        pf->block_intpl_luma_ver[PART_4x4] = davs2_interp_8tap_vert_pp_4x4_sse4;
        /* linking error */
        // ALL_LUMA_PU(block_intpl_luma_ext, davs2_interp_8tap_hv_pp, sse4);
        // pf->block_intpl_luma_ext[PART_4x4] = davs2_interp_8tap_hv_pp_4x4_sse4;
#endif
#endif  // if HIGH_BIT_DEPTH
    }
    if (cpuid & DAVS2_CPU_SSE2) {
        /* memory copy */
        pf->plane_copy = plane_copy_c_sse2;
    }
    if (cpuid & DAVS2_CPU_SSE4) {
        /* block average */
        pf->block_avg = avs_pixel_average_sse128;

#if !HIGH_BIT_DEPTH
        /* interpolate */
        pf->intpl_luma_hor[0][0] = intpl_luma_block_hor_sse128;
        pf->intpl_luma_hor[0][1] = intpl_luma_block_hor_sse128;
        pf->intpl_luma_hor[0][2] = intpl_luma_block_hor_sse128;
        pf->intpl_luma_ext[0]    = intpl_luma_block_ext_sse128;
        /* interpolation results do not match, so this stays disabled;
         * when fixing it, remember to also disable the corresponding
         * AVX2 functions below */
        //pf->intpl_chroma_ver[0] = intpl_chroma_block_ver_sse128;
        pf->intpl_chroma_hor[0]  = intpl_chroma_block_hor_sse128;
        pf->intpl_chroma_ext[0] = intpl_chroma_block_ext_sse128;

        pf->intpl_luma_hor[1][0] = intpl_luma_block_hor_sse128;
        pf->intpl_luma_hor[1][1] = intpl_luma_block_hor_sse128;
        pf->intpl_luma_hor[1][2] = intpl_luma_block_hor_sse128;
        pf->intpl_luma_ext[1]    = intpl_luma_block_ext_sse128;
        //pf->intpl_chroma_ver[1] = intpl_chroma_block_ver_sse128;
        pf->intpl_chroma_hor[1] = intpl_chroma_block_hor_sse128;
        pf->intpl_chroma_ext[1] = intpl_chroma_block_ext_sse128;

        /* vertical luma filters: install the dy-specialized kernels directly
         * (a generic intpl_luma_block_ver_sse128 handle would be overwritten
         * by these immediately anyway) */
        pf->intpl_luma_ver[0][0] = intpl_luma_block_ver0_sse128;
        pf->intpl_luma_ver[0][1] = intpl_luma_block_ver1_sse128;
        pf->intpl_luma_ver[0][2] = intpl_luma_block_ver2_sse128;
        pf->intpl_luma_ver[1][0] = intpl_luma_block_ver0_sse128;
        pf->intpl_luma_ver[1][1] = intpl_luma_block_ver1_sse128;
        pf->intpl_luma_ver[1][2] = intpl_luma_block_ver2_sse128;
#endif
    }

    if (cpuid & DAVS2_CPU_AVX2) {
#if !HIGH_BIT_DEPTH
        pf->intpl_luma_hor[1][0] = intpl_luma_block_hor_avx2;
        pf->intpl_luma_hor[1][1] = intpl_luma_block_hor_avx2;
        pf->intpl_luma_hor[1][2] = intpl_luma_block_hor_avx2;
        pf->intpl_luma_ext[1]    = intpl_luma_block_ext_avx2;

        pf->intpl_chroma_ver[1] = intpl_chroma_block_ver_avx2;
        pf->intpl_chroma_hor[1] = intpl_chroma_block_hor_avx2;
        pf->intpl_chroma_ext[1] = intpl_chroma_block_ext_avx2;

        /* dy-specialized vertical luma kernels, as in the SSE4 branch above */
        pf->intpl_luma_ver[1][0] = intpl_luma_block_ver0_avx2;
        pf->intpl_luma_ver[1][1] = intpl_luma_block_ver1_avx2;
        pf->intpl_luma_ver[1][2] = intpl_luma_block_ver2_avx2;
#endif
    }
#endif  // if HAVE_MMX
}
davs2-1.6/source/common/mc.h000066400000000000000000000033311337322544400157460ustar00rootroot00000000000000/*
 * mc.h
 *
 * Description of this file:
 *    MC functions definition of the davs2 library
 *
 * --------------------------------------------------------------------------
 *
 * davs2 - video decoder of AVS2/IEEE1857.4 video coding standard
 * Copyright (C) 2018~ VCL, NELVT, Peking University
 *
 * Authors: Falei LUO
 *          etc.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at sswang @ pku.edu.cn.
 */

#ifndef DAVS2_MC_H
#define DAVS2_MC_H

#ifdef __cplusplus
extern "C" {
#endif

#define mc_luma FPFX(mc_luma)
void mc_luma  (davs2_t *h, pel_t *dst, int i_dst, int posx, int posy, int width, int height, pel_t *p_fref, int i_fref);
#define mc_chroma FPFX(mc_chroma)
void mc_chroma(davs2_t *h, pel_t *dst, int i_dst, int posx, int posy, int width, int height, pel_t *p_fref, int i_fref);

#ifdef __cplusplus
}
#endif

#endif  // DAVS2_MC_H
davs2-1.6/source/common/memory.cc000066400000000000000000000053401337322544400170170ustar00rootroot00000000000000/*
 * memory.cc
 *
 * Description of this file:
 *    Memory functions definition of the davs2 library
 *
 * --------------------------------------------------------------------------
 *
 * davs2 - video decoder of AVS2/IEEE1857.4 video coding standard
 * Copyright (C) 2018~ VCL, NELVT, Peking University
 *
 * Authors: Falei LUO
 *          etc.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at sswang @ pku.edu.cn.
 */

#include "common.h"
#include "primitives.h"

#if HAVE_MMX
#include "vec/intrinsic.h"
#endif

/* --------------------------------------------------------------------------- */
void *memzero_aligned_c(void *dst, size_t n)
{
    return memset(dst, 0, n);
}

/* --------------------------------------------------------------------------- */
void davs2_memory_init(uint32_t cpuid, ao_funcs_t* pf)
{
    /* memory copy */
    pf->fast_memcpy     = memcpy;
    pf->fast_memset     = memset;
    pf->memcpy_aligned  = memcpy;
    pf->fast_memzero    = memzero_aligned_c;
    pf->memzero_aligned = memzero_aligned_c;

    /* init asm function handles */
#if HAVE_MMX
    if (cpuid & DAVS2_CPU_MMX) {
        pf->fast_memcpy     = davs2_fast_memcpy_mmx;
        pf->memcpy_aligned  = davs2_memcpy_aligned_mmx;
        pf->fast_memset     = davs2_fast_memset_mmx;
        pf->fast_memzero    = davs2_fast_memzero_mmx;
        pf->memzero_aligned = davs2_fast_memzero_mmx;
    }
    if (cpuid & DAVS2_CPU_SSE) {
        // pf->memcpy_aligned  = davs2_memcpy_aligned_sse;
        // pf->memzero_aligned = davs2_memzero_aligned_sse;
    }
    if (cpuid & DAVS2_CPU_SSE2) {
        pf->memzero_aligned = davs2_memzero_aligned_c_sse2;
        // gf_davs2.memcpy_aligned = davs2_memcpy_aligned_c_sse2;
    }
    if (cpuid & DAVS2_CPU_AVX2) {
        pf->memzero_aligned = davs2_memzero_aligned_c_avx;
    }
#endif  // HAVE_MMX
}
davs2-1.6/source/common/osdep.h000066400000000000000000000304451337322544400164670ustar00rootroot00000000000000/*
 * osdep.h
 *
 * Description of this file:
 *    platform-specific code functions definition of the davs2 library
 *
 * --------------------------------------------------------------------------
 *
 * davs2 - video decoder of AVS2/IEEE1857.4 video coding standard
 * Copyright (C) 2018~ VCL, NELVT, Peking University
 *
 * Authors: Falei LUO
 *          etc.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at sswang @ pku.edu.cn.
 */

#ifndef DAVS2_OSDEP_H
#define DAVS2_OSDEP_H

#ifdef __cplusplus
extern "C" {
#endif

/* ---------------------------------------------------------------------------
 * disable warning C4996: functions or variables may be unsafe. */
#if defined(_MSC_VER)
#define WIN32_LEAN_AND_MEAN
#define _CRT_NONSTDC_NO_DEPRECATE
#define _CRT_SECURE_NO_DEPRECATE
#define _CRT_SECURE_NO_WARNINGS
#pragma warning(disable:4324)    /* disable warning C4324: structure was padded due to __declspec(align()) */
#endif

/**
 * ===========================================================================
 * includes
 * ===========================================================================
 */
#define _LARGEFILE_SOURCE 1
#define _FILE_OFFSET_BITS 64
#include <stdio.h>
#include <sys/stat.h>
#include <sys/types.h>

/* ---------------------------------------------------------------------------
 * disable warning C4996: functions or variables may be unsafe. */
#if defined(_MSC_VER)
#include <io.h>
#include <fcntl.h>
#endif

#if defined(__ICL) || defined(_MSC_VER)
#include "configw.h"
#else
#include "config.h"
#endif

#if HAVE_STDINT_H
#include <stdint.h>
#else
#include <inttypes.h>
#endif

#if defined(__INTEL_COMPILER)
#include <mathimf.h>
#else
#include <math.h>
#endif

#if defined(_MSC_VER) || defined(__INTEL_COMPILER)
#include <intrin.h>
#endif

/* disable warning C4100: unreferenced formal parameter */
#if defined(_MSC_VER) || defined(__INTEL_COMPILER)
#define UNUSED_PARAMETER(v) (v)         /* same as UNREFERENCED_PARAMETER */
#else
#define UNUSED_PARAMETER(v) (void)(v)
#endif

/**
 * ===========================================================================
 * const defines
 * ===========================================================================
 */

/* ---------------------------------------------------------------------------
 * Specifies the number of bits per pixel that DAVS2 uses */
#define AVS2_BIT_DEPTH  BIT_DEPTH

#define WORD_SIZE       sizeof(void*)

/**
 * ===========================================================================
 * const defines
 * ===========================================================================
 */
#if defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ > 0)
#define UNINIT(x)               x = x
#define UNUSED                  __attribute__((unused))
#define ALWAYS_INLINE           __attribute__((always_inline)) inline
#define NOINLINE                __attribute__((noinline))
#define MAY_ALIAS               __attribute__((may_alias))
#define davs2_constant_p(x)     __builtin_constant_p(x)
#define davs2_nonconstant_p(x)  (!__builtin_constant_p(x))
#define INLINE                  __inline
#else
#define UNINIT(x)               x
#if defined(__ICL)
#define ALWAYS_INLINE           __forceinline
#define NOINLINE                __declspec(noinline)
#else
#define ALWAYS_INLINE           INLINE
#define NOINLINE
#endif
#define UNUSED
#define MAY_ALIAS
#define davs2_constant_p(x)     0
#define davs2_nonconstant_p(x)  0
#endif

/* string operations */
#if defined(__ICL) || defined(_MSC_VER)
#define INLINE       __inline
#define strcasecmp   _stricmp
#define strncasecmp  _strnicmp
#define snprintf     _snprintf
#define S_ISREG(x)   (((x) & S_IFMT) == S_IFREG)
#if !HAVE_POSIXTHREAD
#define strtok_r     strtok_s
#endif
#else
#include <strings.h>
#endif

#if (defined(__GNUC__) || defined(__INTEL_COMPILER)) && (ARCH_X86 || ARCH_X86_64)
#ifndef HAVE_X86_INLINE_ASM
#define HAVE_X86_INLINE_ASM 1
#endif
#endif

// #if defined(_WIN32)
// /* POSIX says that rename() removes the destination, but win32 doesn't. */
// #define rename(src,dst) (unlink(dst), rename(src,dst))
// #if !HAVE_POSIXTHREAD
// #ifndef strtok_r
// #define strtok_r(str,delim,save) strtok(str, delim)
// #endif
// #endif
// #endif

/* ---------------------------------------------------------------------------
 * align */

/* align a pointer */
#  define CACHE_LINE_SIZE   32    /* for x86-64 and x86 */
#  define ALIGN_POINTER(p)  (p) = (uint8_t *)((intptr_t)((p) + (CACHE_LINE_SIZE - 1)) & (~(intptr_t)(CACHE_LINE_SIZE - 1)))

#  define CACHE_LINE_256B   32    /* 32 bytes = 256 bits, for x86-64 and x86 */
#  define ALIGN_256_PTR(p)  (p) = (uint8_t *)((intptr_t)((p) + (CACHE_LINE_256B - 1)) & (~(intptr_t)(CACHE_LINE_256B - 1)))

#if defined(_MSC_VER)
#define DECLARE_ALIGNED(var, n) __declspec(align(n)) var
// #elif defined(__INTEL_COMPILER)
// #define DECLARE_ALIGNED(var, n) var __declspec(align(n))
#else
#define DECLARE_ALIGNED(var, n) var __attribute__((aligned(n)))
#endif
#define ALIGN32(var) DECLARE_ALIGNED(var, 32)
#define ALIGN16(var) DECLARE_ALIGNED(var, 16)
#define ALIGN8(var)  DECLARE_ALIGNED(var, 8)

// ARM compilers don't reliably align stack variables
// - EABI requires only 8 byte stack alignment to be maintained
// - gcc can't align stack variables to more even if the stack were to be correctly aligned outside the function
// - armcc can't either, but is nice enough to actually tell you so
// - Apple gcc only maintains 4 byte alignment
// - llvm can align the stack, but only in svn and (unrelated) it exposes bugs in all released GNU binutils...

#define ALIGNED_ARRAY_EMU( mask, type, name, sub1, ... )\
    uint8_t name##_u [sizeof(type sub1 __VA_ARGS__) + mask]; \
    type (*name) __VA_ARGS__ = (void*)((intptr_t)(name##_u+mask) & ~mask)

#if ARCH_ARM && SYS_MACOSX
#define ALIGNED_ARRAY_8( ... ) ALIGNED_ARRAY_EMU( 7, __VA_ARGS__ )
#else
#define ALIGNED_ARRAY_8( type, name, sub1, ... )\
    ALIGN8( type name sub1 __VA_ARGS__ )
#endif

#if ARCH_ARM
#define ALIGNED_ARRAY_16( ... ) ALIGNED_ARRAY_EMU( 15, __VA_ARGS__ )
#else
#define ALIGNED_ARRAY_16( type, name, sub1, ... )\
    ALIGN16( type name sub1 __VA_ARGS__ )
#endif

#define EXPAND(x) x

#if defined(STACK_ALIGNMENT) && STACK_ALIGNMENT >= 32
#define ALIGNED_ARRAY_32( type, name, sub1, ... )\
    ALIGN32( type name sub1 __VA_ARGS__ )
#else
#define ALIGNED_ARRAY_32( ... ) EXPAND( ALIGNED_ARRAY_EMU( 31, __VA_ARGS__ ) )
#endif

#define ALIGNED_ARRAY_64( ... ) EXPAND( ALIGNED_ARRAY_EMU( 63, __VA_ARGS__ ) )

/* For AVX2 */
#if ARCH_X86 || ARCH_X86_64
#define NATIVE_ALIGN     32
#define ALIGNED_N        ALIGN32
#define ALIGNED_ARRAY_N  ALIGNED_ARRAY_32
#else
#define NATIVE_ALIGN     16
#define ALIGNED_N        ALIGN16
#define ALIGNED_ARRAY_N  ALIGNED_ARRAY_16
#endif

/* ---------------------------------------------------------------------------
 * threads */
#if HAVE_BEOSTHREAD
#include <kernel/OS.h>
#define davs2_thread_t thread_id
static int ALWAYS_INLINE avs2dec_pthread_create(davs2_thread_t *t, void *a, void *(*f)(void *), void *d)
{
    *t = spawn_thread(f, "", 10, d);
    if (*t < B_NO_ERROR) {
        return -1;
    }
    resume_thread(*t);
    return 0;
}
#define davs2_thread_join(t,s)\
{\
    long tmp; \
    wait_for_thread(t,(s)?(long*)(*(s)):&tmp);\
}

#elif HAVE_POSIXTHREAD
#if defined(_MSC_VER) || defined(__ICL)
#if _MSC_VER >= 1900
#define HAVE_STRUCT_TIMESPEC 1      /* for struct timespec */
#endif
#pragma comment(lib, "pthread_lib.lib")
#endif
#include <pthread.h>
#define davs2_thread_t               pthread_t
#define davs2_thread_create          pthread_create
#define davs2_thread_join            pthread_join
#define davs2_thread_exit            pthread_exit
#define davs2_thread_mutex_t         pthread_mutex_t
#define davs2_thread_mutex_init      pthread_mutex_init
#define davs2_thread_mutex_destroy   pthread_mutex_destroy
#define davs2_thread_mutex_lock      pthread_mutex_lock
#define davs2_thread_mutex_unlock    pthread_mutex_unlock
#define davs2_thread_cond_t          pthread_cond_t
#define davs2_thread_cond_init       pthread_cond_init
#define davs2_thread_cond_destroy    pthread_cond_destroy
#define davs2_thread_cond_signal     pthread_cond_signal
#define davs2_thread_cond_broadcast  pthread_cond_broadcast
#define davs2_thread_cond_wait       pthread_cond_wait
#define davs2_thread_attr_t          pthread_attr_t
#define davs2_thread_attr_init       pthread_attr_init
#define davs2_thread_attr_destroy    pthread_attr_destroy
#if defined(__ARM_ARCH_7A__) || SYS_LINUX
#define davs2_thread_num_processors_np get_nprocs
#else
#define davs2_thread_num_processors_np pthread_num_processors_np
#endif
#define AVS2_PTHREAD_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER

#elif HAVE_WIN32THREAD
#include "win32thread.h"

#else
#define davs2_thread_t               int
#define davs2_thread_create(t,u,f,d) 0
#define davs2_thread_join(t,s)
#endif // HAVE_*THREAD

#if !HAVE_POSIXTHREAD && !HAVE_WIN32THREAD
#define davs2_thread_mutex_t         int
#define davs2_thread_mutex_init(m,f) 0
#define davs2_thread_mutex_destroy(m)
#define davs2_thread_mutex_lock(m)
#define davs2_thread_mutex_unlock(m)
#define davs2_thread_cond_t          int
#define davs2_thread_cond_init(c,f)  0
#define davs2_thread_cond_destroy(c)
#define davs2_thread_cond_broadcast(c)
#define davs2_thread_cond_wait(c,m)
#define davs2_thread_attr_t          int
#define davs2_thread_attr_init(a)    0
#define davs2_thread_attr_destroy(a)
#define AVS2_PTHREAD_MUTEX_INITIALIZER 0
#endif

#if HAVE_WIN32THREAD || PTW32_STATIC_LIB
int davs2_threading_init(void);
#else
#define davs2_threading_init() 0
#endif

#if HAVE_POSIXTHREAD
#if SYS_WINDOWS
#define davs2_lower_thread_priority(p)\
{\
    davs2_thread_t handle = pthread_self();\
    struct sched_param sp;\
    int policy = SCHED_OTHER;\
    pthread_getschedparam(handle, &policy, &sp);\
    sp.sched_priority -= p;\
    pthread_setschedparam(handle, policy, &sp);\
}
#else
#include <unistd.h>
#define davs2_lower_thread_priority(p) { int nice_ret = nice(p); }
#define davs2_thread_spin_init(plock,pshare)   pthread_spin_init(plock, pshare)
#endif /* SYS_WINDOWS */
#elif HAVE_WIN32THREAD
#define davs2_lower_thread_priority(p) SetThreadPriority(GetCurrentThread(), DAVS2_MAX(-2, -p))
#else
#define davs2_lower_thread_priority(p)
#endif

#if SYS_WINDOWS
#define davs2_sleep_ms(x)   Sleep(x)
#else
#define davs2_sleep_ms(x)   usleep((x) * 1000)
#endif

/**
 * ===========================================================================
 * inline functions
 * ===========================================================================
 */
static int ALWAYS_INLINE davs2_is_regular_file(int filehandle)
{
    struct stat file_stat;

    if (fstat(filehandle, &file_stat)) {
        return -1;
    }
    return S_ISREG(file_stat.st_mode);
}

static int ALWAYS_INLINE davs2_is_regular_file_path(const char *filename)
{
    struct stat file_stat;

    if (stat(filename, &file_stat)) {
        return -1;
    }
    return S_ISREG(file_stat.st_mode);
}

#ifdef __cplusplus
}
#endif
#endif /* DAVS2_OSDEP_H */
davs2-1.6/source/common/pixel.cc000066400000000000000000000506041337322544400166330ustar00rootroot00000000000000/*
 * pixel.cc
 *
 * Description of this file:
 *    Pixel processing functions definition of the davs2 library
 *
 * --------------------------------------------------------------------------
 *
 * davs2 - video decoder of AVS2/IEEE1857.4 video coding standard
 * Copyright (C) 2018~ VCL, NELVT, Peking University
 *
 * Authors: Falei LUO
 *          etc.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at sswang @ pku.edu.cn.
*/ #include "common.h" #include "vec/intrinsic.h" #ifdef __cplusplus extern "C" { #endif /** * =========================================================================== * local & global variables (const tables) * =========================================================================== */ /* --------------------------------------------------------------------------- * partition map table */ const uint8_t g_partition_map_tab[] = { // 4 8 12 16 20 24 28 32 36 40 44 48 52 56 60 64 PART_4x4, PART_4x8, 255, PART_4x16, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 4 PART_8x4, PART_8x8, 255, PART_8x16, 255, 255, 255, PART_8x32, 255, 255, 255, 255, 255, 255, 255, 255, // 8 255, 255, 255, PART_12x16, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 12 PART_16x4, PART_16x8, PART_16x12, PART_16x16, 255, 255, 255, PART_16x32, 255, 255, 255, 255, 255, 255, 255, PART_16x64, // 16 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 20 255, 255, 255, 255, 255, 255, 255, PART_24x32, 255, 255, 255, 255, 255, 255, 255, 255, // 24 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 28 255, PART_32x8, 255, PART_32x16, 255, PART_32x24, 255, PART_32x32, 255, 255, 255, 255, 255, 255, 255, PART_32x64, // 32 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 36 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 40 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 44 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, PART_48x64, // 48 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 52 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 56 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 60 255, 255, 255, PART_64x16, 255, 255, 255, PART_64x32, 255, 255, 255, PART_64x48, 255, 255, 255, PART_64x64 // 64 }; #define PIXEL_ADD_PS_C(w, h) \ static void davs2_pixel_add_ps_##w##x##h(pel_t *a, intptr_t dstride, const pel_t *b0, const coeff_t* b1, intptr_t sstride0, intptr_t sstride1)\ {\ int x, y;\ for (y = 0; y < h; y++) {\ for (x = 0; x < w; x++) {\ a[x] = (pel_t)DAVS2_CLIP1(b0[x] + b1[x]);\ }\ b0 += sstride0;\ b1 += sstride1;\ a += dstride;\ }\ } #define BLOCKCOPY_PP_C(w, h) \ static void davs2_blockcopy_pp_##w##x##h(pel_t *a, intptr_t stridea, const pel_t *b, intptr_t strideb)\ {\ int x, y;\ for (y = 0; y < h; y++) {\ for (x = 0; x < w; x++) {\ a[x] = b[x];\ }\ a += stridea;\ b += strideb;\ }\ } #define BLOCKCOPY_SS_C(w, h) \ static void davs2_blockcopy_ss_##w##x##h(coeff_t* a, intptr_t stridea, const coeff_t* b, intptr_t strideb)\ {\ int x, y;\ for (y = 0; y < h; y++) {\ for (x = 0; x < w; x++) {\ a[x] = b[x];\ }\ a += stridea;\ b += strideb;\ }\ } #define BLOCK_OP_C(w, h) \ PIXEL_ADD_PS_C(w, h); \ BLOCKCOPY_PP_C(w, h); \ BLOCKCOPY_SS_C(w, h); BLOCK_OP_C(64, 64) /* 64x64 */ BLOCK_OP_C(64, 32) BLOCK_OP_C(32, 64) BLOCK_OP_C(64, 16) BLOCK_OP_C(64, 48) BLOCK_OP_C(16, 64) BLOCK_OP_C(48, 64) BLOCK_OP_C(32, 32) /* 32x32 */ BLOCK_OP_C(32, 16) BLOCK_OP_C(16, 32) BLOCK_OP_C(32, 8) BLOCK_OP_C(32, 24) BLOCK_OP_C( 8, 32) BLOCK_OP_C(24, 32) BLOCK_OP_C(16, 16) /* 16x16 */ BLOCK_OP_C(16, 8) BLOCK_OP_C( 8, 16) BLOCK_OP_C(16, 4) BLOCK_OP_C(16, 12) BLOCK_OP_C( 4, 16) BLOCK_OP_C(12, 16) BLOCK_OP_C( 8, 8) /* 8x8 */ BLOCK_OP_C( 8, 4) BLOCK_OP_C( 4, 8) BLOCK_OP_C( 4, 4) /* 4x4 */ #define DECL_PIXELS(cpu) \ FUNCDEF_PU(void, 
pixel_avg, cpu, pel_t* dst, intptr_t dstride, const pel_t* src0, intptr_t sstride0, const pel_t* src1, intptr_t sstride1, int);\ FUNCDEF_PU(void, pixel_add_ps, cpu, pel_t* a, intptr_t dstride, const pel_t* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);\ FUNCDEF_PU(void, blockcopy_pp, cpu, pel_t *a, intptr_t stridea, const pel_t *b, intptr_t strideb);\ FUNCDEF_PU(void, blockcopy_ss, cpu, int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb);\ FUNCDEF_CHROMA_PU(void, addAvg, cpu, const int16_t*, const int16_t*, pel_t*, intptr_t, intptr_t, intptr_t) DECL_PIXELS(mmx); DECL_PIXELS(mmx2); DECL_PIXELS(sse2); DECL_PIXELS(sse3); DECL_PIXELS(sse4); DECL_PIXELS(ssse3); DECL_PIXELS(avx); DECL_PIXELS(xop); DECL_PIXELS(avx2); #undef DECL_PIXELS #define ALL_LUMA_PU(name1, name2, cpu) \ pixf->name1[PART_64x64] = davs2_ ## name2 ## _64x64 ## cpu; /* 64x64 */ \ pixf->name1[PART_64x32] = davs2_ ## name2 ## _64x32 ## cpu;\ pixf->name1[PART_32x64] = davs2_ ## name2 ## _32x64 ## cpu;\ pixf->name1[PART_64x16] = davs2_ ## name2 ## _64x16 ## cpu;\ pixf->name1[PART_64x48] = davs2_ ## name2 ## _64x48 ## cpu;\ pixf->name1[PART_16x64] = davs2_ ## name2 ## _16x64 ## cpu;\ pixf->name1[PART_48x64] = davs2_ ## name2 ## _48x64 ## cpu;\ pixf->name1[PART_32x32] = davs2_ ## name2 ## _32x32 ## cpu; /* 32x32 */ \ pixf->name1[PART_32x16] = davs2_ ## name2 ## _32x16 ## cpu;\ pixf->name1[PART_16x32] = davs2_ ## name2 ## _16x32 ## cpu;\ pixf->name1[PART_32x8 ] = davs2_ ## name2 ## _32x8 ## cpu;\ pixf->name1[PART_32x24] = davs2_ ## name2 ## _32x24 ## cpu;\ pixf->name1[PART_8x32 ] = davs2_ ## name2 ## _8x32 ## cpu;\ pixf->name1[PART_24x32] = davs2_ ## name2 ## _24x32 ## cpu;\ pixf->name1[PART_16x16] = davs2_ ## name2 ## _16x16 ## cpu; /* 16x16 */ \ pixf->name1[PART_16x8 ] = davs2_ ## name2 ## _16x8 ## cpu;\ pixf->name1[PART_8x16 ] = davs2_ ## name2 ## _8x16 ## cpu;\ pixf->name1[PART_16x4 ] = davs2_ ## name2 ## _16x4 ## cpu;\ pixf->name1[PART_16x12] = davs2_ ## name2 ## _16x12 ## cpu;\ pixf->name1[PART_4x16 ] = davs2_ ## name2 ## _4x16 ## cpu;\ pixf->name1[PART_12x16] = davs2_ ## name2 ## _12x16 ## cpu;\ pixf->name1[PART_8x8 ] = davs2_ ## name2 ## _8x8 ## cpu; /* 8x8 */ \ pixf->name1[PART_8x4 ] = davs2_ ## name2 ## _8x4 ## cpu;\ pixf->name1[PART_4x8 ] = davs2_ ## name2 ## _4x8 ## cpu;\ pixf->name1[PART_4x4 ] = davs2_ ## name2 ## _4x4 ## cpu /* 4x4 */ void davs2_pixel_init(uint32_t cpuid, ao_funcs_t* pixf) { ALL_LUMA_PU(add_ps, pixel_add_ps, ); ALL_LUMA_PU(copy_pp, blockcopy_pp, ); ALL_LUMA_PU(copy_ss, blockcopy_ss, ); #if HAVE_MMX if (cpuid & DAVS2_CPU_SSE2) { #if HIGH_BIT_DEPTH //10bit assemble if (sizeof(pel_t) == sizeof(int16_t) && cpuid) { pixf->copy_pp[PART_64x64] = (copy_pp_t)davs2_blockcopy_ss_64x64_sse2; /* 64x64 */ pixf->copy_pp[PART_64x32] = (copy_pp_t)davs2_blockcopy_ss_64x32_sse2; pixf->copy_pp[PART_32x64] = (copy_pp_t)davs2_blockcopy_ss_32x64_sse2; pixf->copy_pp[PART_64x16] = (copy_pp_t)davs2_blockcopy_ss_64x16_sse2; pixf->copy_pp[PART_64x48] = (copy_pp_t)davs2_blockcopy_ss_64x48_sse2; pixf->copy_pp[PART_16x64] = (copy_pp_t)davs2_blockcopy_ss_16x64_sse2; pixf->copy_pp[PART_48x64] = (copy_pp_t)davs2_blockcopy_ss_48x64_sse2; pixf->copy_pp[PART_32x32] = (copy_pp_t)davs2_blockcopy_ss_32x32_sse2; /* 32x32 */ pixf->copy_pp[PART_32x16] = (copy_pp_t)davs2_blockcopy_ss_32x16_sse2; pixf->copy_pp[PART_16x32] = (copy_pp_t)davs2_blockcopy_ss_16x32_sse2; pixf->copy_pp[PART_32x8 ] = (copy_pp_t)davs2_blockcopy_ss_32x8_sse2; pixf->copy_pp[PART_32x24] = (copy_pp_t)davs2_blockcopy_ss_32x24_sse2; 
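            /* Why copy_pp reuses the int16_t copy_ss kernels in this branch:
             * at HIGH_BIT_DEPTH, pel_t is 16 bits wide, so a pixel copy and
             * a coefficient copy move exactly the same bytes, and the cast
             * only adapts the pointer types.  The sizeof() guard above keeps
             * this reuse from binding when pel_t is not 16-bit. */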
pixf->copy_pp[PART_8x32 ] = (copy_pp_t)davs2_blockcopy_ss_8x32_sse2; pixf->copy_pp[PART_24x32] = (copy_pp_t)davs2_blockcopy_ss_24x32_sse2; pixf->copy_pp[PART_16x16] = (copy_pp_t)davs2_blockcopy_ss_16x16_sse2; /* 16x16 */ pixf->copy_pp[PART_16x8 ] = (copy_pp_t)davs2_blockcopy_ss_16x8_sse2; pixf->copy_pp[PART_8x16 ] = (copy_pp_t)davs2_blockcopy_ss_8x16_sse2; pixf->copy_pp[PART_16x4 ] = (copy_pp_t)davs2_blockcopy_ss_16x4_sse2; pixf->copy_pp[PART_16x12] = (copy_pp_t)davs2_blockcopy_ss_16x12_sse2; pixf->copy_pp[PART_4x16 ] = (copy_pp_t)davs2_blockcopy_ss_4x16_sse2; pixf->copy_pp[PART_12x16] = (copy_pp_t)davs2_blockcopy_ss_12x16_sse2; pixf->copy_pp[PART_8x8 ] = (copy_pp_t)davs2_blockcopy_ss_8x8_sse2; /* 8x8 */ pixf->copy_pp[PART_8x4 ] = (copy_pp_t)davs2_blockcopy_ss_8x4_sse2; pixf->copy_pp[PART_4x8 ] = (copy_pp_t)davs2_blockcopy_ss_4x8_sse2; pixf->copy_pp[PART_4x4 ] = (copy_pp_t)davs2_blockcopy_ss_4x4_sse2; /* 4x4 */ } if (sizeof(coeff_t) == sizeof(int16_t) && cpuid) { pixf->copy_ss[PART_64x64] = (copy_ss_t)davs2_blockcopy_ss_64x64_sse2; /* 64x64 */ pixf->copy_ss[PART_64x32] = (copy_ss_t)davs2_blockcopy_ss_64x32_sse2; pixf->copy_ss[PART_32x64] = (copy_ss_t)davs2_blockcopy_ss_32x64_sse2; pixf->copy_ss[PART_64x16] = (copy_ss_t)davs2_blockcopy_ss_64x16_sse2; pixf->copy_ss[PART_64x48] = (copy_ss_t)davs2_blockcopy_ss_64x48_sse2; pixf->copy_ss[PART_16x64] = (copy_ss_t)davs2_blockcopy_ss_16x64_sse2; pixf->copy_ss[PART_48x64] = (copy_ss_t)davs2_blockcopy_ss_48x64_sse2; pixf->copy_ss[PART_32x32] = (copy_ss_t)davs2_blockcopy_ss_32x32_sse2; /* 32x32 */ pixf->copy_ss[PART_32x16] = (copy_ss_t)davs2_blockcopy_ss_32x16_sse2; pixf->copy_ss[PART_16x32] = (copy_ss_t)davs2_blockcopy_ss_16x32_sse2; pixf->copy_ss[PART_32x8 ] = (copy_ss_t)davs2_blockcopy_ss_32x8_sse2; pixf->copy_ss[PART_32x24] = (copy_ss_t)davs2_blockcopy_ss_32x24_sse2; pixf->copy_ss[PART_8x32 ] = (copy_ss_t)davs2_blockcopy_ss_8x32_sse2; pixf->copy_ss[PART_24x32] = (copy_ss_t)davs2_blockcopy_ss_24x32_sse2; pixf->copy_ss[PART_16x16] = (copy_ss_t)davs2_blockcopy_ss_16x16_sse2; /* 16x16 */ pixf->copy_ss[PART_16x8 ] = (copy_ss_t)davs2_blockcopy_ss_16x8_sse2; pixf->copy_ss[PART_8x16 ] = (copy_ss_t)davs2_blockcopy_ss_8x16_sse2; pixf->copy_ss[PART_16x4 ] = (copy_ss_t)davs2_blockcopy_ss_16x4_sse2; pixf->copy_ss[PART_16x12] = (copy_ss_t)davs2_blockcopy_ss_16x12_sse2; pixf->copy_ss[PART_4x16 ] = (copy_ss_t)davs2_blockcopy_ss_4x16_sse2; pixf->copy_ss[PART_12x16] = (copy_ss_t)davs2_blockcopy_ss_12x16_sse2; pixf->copy_ss[PART_8x8 ] = (copy_ss_t)davs2_blockcopy_ss_8x8_sse2; /* 8x8 */ pixf->copy_ss[PART_8x4 ] = (copy_ss_t)davs2_blockcopy_ss_8x4_sse2; pixf->copy_ss[PART_4x8 ] = (copy_ss_t)davs2_blockcopy_ss_4x8_sse2; pixf->copy_ss[PART_4x4 ] = (copy_ss_t)davs2_blockcopy_ss_4x4_sse2; /* 4x4 */ } #else ALL_LUMA_PU(copy_pp, blockcopy_pp, _sse2); ALL_LUMA_PU(copy_ss, blockcopy_ss, _sse2); #endif } if (cpuid & DAVS2_CPU_SSE4) { #if HIGH_BIT_DEPTH //10bit assemble #else pixf->add_ps[PART_4x4 ] = davs2_pixel_add_ps_4x4_sse4; pixf->add_ps[PART_4x8 ] = davs2_pixel_add_ps_4x8_sse4; pixf->add_ps[PART_4x16 ] = davs2_pixel_add_ps_4x16_sse4; pixf->add_ps[PART_8x8 ] = davs2_pixel_add_ps_8x8_sse4; pixf->add_ps[PART_8x16 ] = davs2_pixel_add_ps_8x16_sse4; pixf->add_ps[PART_8x32 ] = davs2_pixel_add_ps_8x32_sse4; pixf->add_ps[PART_16x4 ] = davs2_pixel_add_ps_16x4_sse4; pixf->add_ps[PART_16x8 ] = davs2_pixel_add_ps_16x8_sse4; pixf->add_ps[PART_16x12] = davs2_pixel_add_ps_16x12_sse4; pixf->add_ps[PART_16x16] = davs2_pixel_add_ps_16x16_sse4; pixf->add_ps[PART_16x64] = 
davs2_pixel_add_ps_16x64_sse4; pixf->add_ps[PART_32x8 ] = davs2_pixel_add_ps_32x8_sse4; pixf->add_ps[PART_32x16] = davs2_pixel_add_ps_32x16_sse4; pixf->add_ps[PART_32x24] = davs2_pixel_add_ps_32x24_sse4; pixf->add_ps[PART_32x32] = davs2_pixel_add_ps_32x32_sse4; pixf->add_ps[PART_32x64] = davs2_pixel_add_ps_32x64_sse4; pixf->add_ps[PART_64x16] = davs2_pixel_add_ps_64x16_sse4; pixf->add_ps[PART_64x32] = davs2_pixel_add_ps_64x32_sse4; pixf->add_ps[PART_64x48] = davs2_pixel_add_ps_64x48_sse4; pixf->add_ps[PART_64x64] = davs2_pixel_add_ps_64x64_sse4; #endif } if (cpuid & DAVS2_CPU_AVX) { #if HIGH_BIT_DEPTH //10bit assemble if (sizeof(pel_t) == sizeof(int16_t) && cpuid) { pixf->copy_pp[PART_64x64] = (copy_pp_t)davs2_blockcopy_ss_64x64_avx; pixf->copy_pp[PART_64x32] = (copy_pp_t)davs2_blockcopy_ss_64x32_avx; pixf->copy_pp[PART_32x64] = (copy_pp_t)davs2_blockcopy_ss_32x64_avx; pixf->copy_pp[PART_64x16] = (copy_pp_t)davs2_blockcopy_ss_64x16_avx; pixf->copy_pp[PART_64x48] = (copy_pp_t)davs2_blockcopy_ss_64x48_avx; pixf->copy_pp[PART_16x64] = (copy_pp_t)davs2_blockcopy_ss_16x64_avx; pixf->copy_pp[PART_48x64] = (copy_pp_t)davs2_blockcopy_ss_48x64_avx; pixf->copy_pp[PART_32x32] = (copy_pp_t)davs2_blockcopy_ss_32x32_avx; pixf->copy_pp[PART_32x16] = (copy_pp_t)davs2_blockcopy_ss_32x16_avx; pixf->copy_pp[PART_16x32] = (copy_pp_t)davs2_blockcopy_ss_16x32_avx; pixf->copy_pp[PART_32x8 ] = (copy_pp_t)davs2_blockcopy_ss_32x8_avx; pixf->copy_pp[PART_32x24] = (copy_pp_t)davs2_blockcopy_ss_32x24_avx; pixf->copy_pp[PART_24x32] = (copy_pp_t)davs2_blockcopy_ss_24x32_avx; pixf->copy_pp[PART_16x16] = (copy_pp_t)davs2_blockcopy_ss_16x16_avx; pixf->copy_pp[PART_16x8 ] = (copy_pp_t)davs2_blockcopy_ss_16x8_avx; pixf->copy_pp[PART_16x4 ] = (copy_pp_t)davs2_blockcopy_ss_16x4_avx; pixf->copy_pp[PART_16x12] = (copy_pp_t)davs2_blockcopy_ss_16x12_avx; } if (sizeof(coeff_t) == sizeof(int16_t) && cpuid) { pixf->copy_ss[PART_64x64] = (copy_ss_t)davs2_blockcopy_ss_64x64_avx; pixf->copy_ss[PART_64x32] = (copy_ss_t)davs2_blockcopy_ss_64x32_avx; pixf->copy_ss[PART_32x64] = (copy_ss_t)davs2_blockcopy_ss_32x64_avx; pixf->copy_ss[PART_64x16] = (copy_ss_t)davs2_blockcopy_ss_64x16_avx; pixf->copy_ss[PART_64x48] = (copy_ss_t)davs2_blockcopy_ss_64x48_avx; pixf->copy_ss[PART_16x64] = (copy_ss_t)davs2_blockcopy_ss_16x64_avx; pixf->copy_ss[PART_48x64] = (copy_ss_t)davs2_blockcopy_ss_48x64_avx; pixf->copy_ss[PART_32x32] = (copy_ss_t)davs2_blockcopy_ss_32x32_avx; pixf->copy_ss[PART_32x16] = (copy_ss_t)davs2_blockcopy_ss_32x16_avx; pixf->copy_ss[PART_16x32] = (copy_ss_t)davs2_blockcopy_ss_16x32_avx; pixf->copy_ss[PART_32x8 ] = (copy_ss_t)davs2_blockcopy_ss_32x8_avx; pixf->copy_ss[PART_32x24] = (copy_ss_t)davs2_blockcopy_ss_32x24_avx; pixf->copy_ss[PART_24x32] = (copy_ss_t)davs2_blockcopy_ss_24x32_avx; pixf->copy_ss[PART_16x16] = (copy_ss_t)davs2_blockcopy_ss_16x16_avx; pixf->copy_ss[PART_16x8 ] = (copy_ss_t)davs2_blockcopy_ss_16x8_avx; pixf->copy_ss[PART_16x4 ] = (copy_ss_t)davs2_blockcopy_ss_16x4_avx; pixf->copy_ss[PART_16x12] = (copy_ss_t)davs2_blockcopy_ss_16x12_avx; } #else pixf->copy_pp[PART_64x64] = davs2_blockcopy_pp_64x64_avx; pixf->copy_pp[PART_64x32] = davs2_blockcopy_pp_64x32_avx; pixf->copy_pp[PART_32x64] = davs2_blockcopy_pp_32x64_avx; pixf->copy_pp[PART_64x16] = davs2_blockcopy_pp_64x16_avx; pixf->copy_pp[PART_64x48] = davs2_blockcopy_pp_64x48_avx; pixf->copy_pp[PART_48x64] = davs2_blockcopy_pp_48x64_avx; pixf->copy_pp[PART_32x32] = davs2_blockcopy_pp_32x32_avx; pixf->copy_pp[PART_32x16] = davs2_blockcopy_pp_32x16_avx; 
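        /* Note: only partitions 32 samples wide or wider get AVX pp-copy
         * handles in this branch; narrower partitions keep the SSE2 handles
         * installed earlier via ALL_LUMA_PU(copy_pp, blockcopy_pp, _sse2). */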
        pixf->copy_pp[PART_32x8 ] = davs2_blockcopy_pp_32x8_avx;
        pixf->copy_pp[PART_32x24] = davs2_blockcopy_pp_32x24_avx;

        pixf->copy_ss[PART_64x64] = davs2_blockcopy_ss_64x64_avx;
        pixf->copy_ss[PART_64x32] = davs2_blockcopy_ss_64x32_avx;
        pixf->copy_ss[PART_32x64] = davs2_blockcopy_ss_32x64_avx;
        pixf->copy_ss[PART_64x16] = davs2_blockcopy_ss_64x16_avx;
        pixf->copy_ss[PART_64x48] = davs2_blockcopy_ss_64x48_avx;
        pixf->copy_ss[PART_16x64] = davs2_blockcopy_ss_16x64_avx;
        pixf->copy_ss[PART_48x64] = davs2_blockcopy_ss_48x64_avx;
        pixf->copy_ss[PART_32x32] = davs2_blockcopy_ss_32x32_avx;
        pixf->copy_ss[PART_32x16] = davs2_blockcopy_ss_32x16_avx;
        pixf->copy_ss[PART_16x32] = davs2_blockcopy_ss_16x32_avx;
        pixf->copy_ss[PART_32x8 ] = davs2_blockcopy_ss_32x8_avx;
        pixf->copy_ss[PART_32x24] = davs2_blockcopy_ss_32x24_avx;
        pixf->copy_ss[PART_24x32] = davs2_blockcopy_ss_24x32_avx;
        pixf->copy_ss[PART_16x16] = davs2_blockcopy_ss_16x16_avx;
        pixf->copy_ss[PART_16x8 ] = davs2_blockcopy_ss_16x8_avx;
        pixf->copy_ss[PART_16x4 ] = davs2_blockcopy_ss_16x4_avx;
        pixf->copy_ss[PART_16x12] = davs2_blockcopy_ss_16x12_avx;
#endif
    }

    if (cpuid & DAVS2_CPU_AVX2) {
#if HIGH_BIT_DEPTH
        // 10-bit assembly
#else
        pixf->add_ps[PART_16x4 ] = davs2_pixel_add_ps_16x4_avx2;
        pixf->add_ps[PART_16x8 ] = davs2_pixel_add_ps_16x8_avx2;
        pixf->add_ps[PART_16x12] = davs2_pixel_add_ps_16x12_avx2;
        pixf->add_ps[PART_16x16] = davs2_pixel_add_ps_16x16_avx2;
        pixf->add_ps[PART_16x64] = davs2_pixel_add_ps_16x64_avx2;
#if ARCH_X86_64
        pixf->add_ps[PART_32x8 ] = davs2_pixel_add_ps_32x8_avx2;
        pixf->add_ps[PART_32x16] = davs2_pixel_add_ps_32x16_avx2;
        pixf->add_ps[PART_32x24] = davs2_pixel_add_ps_32x24_avx2;
        pixf->add_ps[PART_32x32] = davs2_pixel_add_ps_32x32_avx2;
        pixf->add_ps[PART_32x64] = davs2_pixel_add_ps_32x64_avx2;
#endif
        pixf->add_ps[PART_64x16] = davs2_pixel_add_ps_64x16_avx2;
        pixf->add_ps[PART_64x32] = davs2_pixel_add_ps_64x32_avx2;
        pixf->add_ps[PART_64x48] = davs2_pixel_add_ps_64x48_avx2;
        pixf->add_ps[PART_64x64] = davs2_pixel_add_ps_64x64_avx2;
#endif
    }
#endif  // HAVE_MMX
}

#ifdef __cplusplus
}
#endif
davs2-1.6/source/common/predict.cc000066400000000000000000000615551337322544400171510ustar00rootroot00000000000000/*
 * predict.cc
 *
 * Description of this file:
 *    Prediction functions definition of the davs2 library
 *
 * --------------------------------------------------------------------------
 *
 * davs2 - video decoder of AVS2/IEEE1857.4 video coding standard
 * Copyright (C) 2018~ VCL, NELVT, Peking University
 *
 * Authors: Falei LUO
 *          etc.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at sswang @ pku.edu.cn.
*/ #include "common.h" #include "predict.h" #include "block_info.h" /** * =========================================================================== * local & global variables (const tables) * =========================================================================== */ /** * =========================================================================== * function defines * =========================================================================== */ /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE void check_scaling_neighbor_mv(davs2_t *h, mv_t *mv, int mult_distance, int ref_neighbor) { if (ref_neighbor >= 0) { int devide_distance = get_distance_index_p(h, ref_neighbor); int devide_distance_src = get_distance_index_p_scale(h, ref_neighbor); mv->y = scale_mv_default_y(h, mv->y, mult_distance, devide_distance, devide_distance_src); mv->x = scale_mv_default (h, mv->x, mult_distance, devide_distance_src); } else { mv->v = 0; } } /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE void check_scaling_neighbor_mv_b(davs2_t *h, mv_t *mv, int mult_distance, int mult_distance_src, int ref_neighbor) { if (ref_neighbor >= 0) { mv->y = scale_mv_default_y(h, mv->y, mult_distance, mult_distance, mult_distance_src); mv->x = scale_mv_default(h, mv->x, mult_distance, mult_distance_src); } else { mv->v = 0; } } /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE int recheck_neighbor_ref_avail(davs2_t *h, int ref_frame, int neighbor_ref) { if (neighbor_ref != -1) { if (((ref_frame == h->num_of_references - 1 && neighbor_ref != h->num_of_references - 1) || (ref_frame != h->num_of_references - 1 && neighbor_ref == h->num_of_references - 1)) && (h->i_frame_type == AVS2_P_SLICE || h->i_frame_type == AVS2_F_SLICE) && h->b_bkgnd_picture) { neighbor_ref = -1; } if (h->i_frame_type == AVS2_S_SLICE) { neighbor_ref = -1; } } return neighbor_ref; } /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE int derive_mv_pred_type(int ref_frame, int rFrameL, int rFrameU, int rFrameUR, int pu_type_for_mvp) { int mvp_type = MVPRED_xy_MIN; if ((rFrameL != INVALID_REF) && (rFrameU == INVALID_REF) && (rFrameUR == INVALID_REF)) { mvp_type = MVPRED_L; } else if ((rFrameL == INVALID_REF) && (rFrameU != INVALID_REF) && (rFrameUR == INVALID_REF)) { mvp_type = MVPRED_U; } else if ((rFrameL == INVALID_REF) && (rFrameU == INVALID_REF) && (rFrameUR != INVALID_REF)) { mvp_type = MVPRED_UR; } else { switch (pu_type_for_mvp) { case 1: case 4: if (rFrameL == ref_frame) { mvp_type = MVPRED_L; } break; case 2: if (rFrameUR == ref_frame) { mvp_type = MVPRED_UR; } break; case 3: if (rFrameU == ref_frame) { mvp_type = MVPRED_U; } break; default: break; } } return mvp_type; } /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE int16_t derive_median_mv(int mva, int mvb, int mvc) { int mvp; if (((mva < 0) && (mvb > 0) && (mvc > 0)) || ((mva > 0) && (mvb < 0) && (mvc < 0))) { mvp = (mvb + mvc) / 2; // b } else if (((mvb < 0) && (mva > 0) && (mvc > 0)) || ((mvb > 0) && (mva < 0) && (mvc < 0))) { mvp = (mvc + mva) / 2; // c } else if (((mvc < 0) && (mva > 0) && (mvb > 0)) || ((mvc > 0) && (mva < 0) && (mvb < 0))) { mvp = (mva + mvb) / 2; // a } else { const int dAB = DAVS2_ABS(mva - mvb); // for Ax const int dBC = DAVS2_ABS(mvb - mvc); // for Bx const int dCA = DAVS2_ABS(mvc - mva); // for Cx const 
int min_diff = DAVS2_MIN(dAB, DAVS2_MIN(dBC, dCA)); if (min_diff == dAB) { mvp = (mva + mvb) / 2; // a; } else if (min_diff == dBC) { mvp = (mvb + mvc) / 2; // b; } else { mvp = (mvc + mva) / 2; // c; } } return (int16_t)mvp; } /* --------------------------------------------------------------------------- * get neighboring MVs for MVP */ static ALWAYS_INLINE void cu_get_neighbors_default_mvp(davs2_t *h, cu_t *p_cu, int pix_cu_x, int pix_cu_y, int bsx) { neighbor_inter_t *neighbors = h->lcu.neighbor_inter; int cur_slice_idx = p_cu->i_slice_nr; int x0 = pix_cu_x >> MIN_PU_SIZE_IN_BIT; int y0 = pix_cu_y >> MIN_PU_SIZE_IN_BIT; int x1 = (bsx >> MIN_PU_SIZE_IN_BIT) + x0 - 1; /* 1. check whether the top-right 4x4 block is reconstructed */ int x4_TR = x1 - h->lcu.i_spu_x; int y4_TR = y0 - h->lcu.i_spu_y; int avail_TR = h->p_tab_TR_avail[(y4_TR << (h->i_lcu_level - B4X4_IN_BIT)) + x4_TR]; /* 2. get neighboring blocks */ cu_get_neighbor_spatial(h, cur_slice_idx, &neighbors[BLK_LEFT ], x0 - 1, y0 ); cu_get_neighbor_spatial(h, cur_slice_idx, &neighbors[BLK_TOP ], x0 , y0 - 1); cu_get_neighbor_spatial(h, cur_slice_idx, &neighbors[BLK_TOPLEFT ], x0 - 1, y0 - 1); cu_get_neighbor_spatial(h, cur_slice_idx, &neighbors[BLK_TOPRIGHT], avail_TR ? x1 + 1 : -1, y0 - 1); } /* --------------------------------------------------------------------------- * set motion vector predictor */ void get_mvp_default(davs2_t *h, cu_t *p_cu, int pix_x, int pix_y, mv_t *pmv, int bwd_2nd, int ref_frame, int bsx, int pu_type_for_mvp) { int mvPredType, rFrameL, rFrameU, rFrameUL, rFrameUR; mv_t mva, mvb, mvc, mvd; int is_available_UR; cu_get_neighbors_default_mvp(h, p_cu, pix_x, pix_y, bsx); is_available_UR = h->lcu.neighbor_inter[BLK_TOPRIGHT].is_available; rFrameL = h->lcu.neighbor_inter[BLK_LEFT ].ref_idx.r[bwd_2nd]; rFrameU = h->lcu.neighbor_inter[BLK_TOP ].ref_idx.r[bwd_2nd]; rFrameUL = h->lcu.neighbor_inter[BLK_TOPLEFT].ref_idx.r[bwd_2nd]; rFrameUR = is_available_UR ? h->lcu.neighbor_inter[BLK_TOPRIGHT].ref_idx.r[bwd_2nd] : rFrameUL; mva = h->lcu.neighbor_inter[BLK_LEFT ].mv[bwd_2nd]; mvb = h->lcu.neighbor_inter[BLK_TOP ].mv[bwd_2nd]; mvd = h->lcu.neighbor_inter[BLK_TOPLEFT].mv[bwd_2nd]; mvc = is_available_UR ? h->lcu.neighbor_inter[BLK_TOPRIGHT].mv[bwd_2nd] : mvd; rFrameL = recheck_neighbor_ref_avail(h, ref_frame, rFrameL); rFrameU = recheck_neighbor_ref_avail(h, ref_frame, rFrameU); rFrameUR = recheck_neighbor_ref_avail(h, ref_frame, rFrameUR); mvPredType = derive_mv_pred_type(ref_frame, rFrameL, rFrameU, rFrameUR, pu_type_for_mvp); if (h->i_frame_type == AVS2_B_SLICE) { int mult_distance = get_distance_index_b(h, bwd_2nd ? B_BWD : B_FWD); int mult_distance_src = get_distance_index_b_scale(h, bwd_2nd ? 
B_BWD : B_FWD); check_scaling_neighbor_mv_b(h, &mva, mult_distance, mult_distance_src, rFrameL); check_scaling_neighbor_mv_b(h, &mvb, mult_distance, mult_distance_src, rFrameU); check_scaling_neighbor_mv_b(h, &mvc, mult_distance, mult_distance_src, rFrameUR); } else { int mult_distance = get_distance_index_p(h, ref_frame); check_scaling_neighbor_mv(h, &mva, mult_distance, rFrameL); check_scaling_neighbor_mv(h, &mvb, mult_distance, rFrameU); check_scaling_neighbor_mv(h, &mvc, mult_distance, rFrameUR); } switch (mvPredType) { case MVPRED_xy_MIN: pmv->x = derive_median_mv(mva.x, mvb.x, mvc.x); // x pmv->y = derive_median_mv(mva.y, mvb.y, mvc.y); // y break; case MVPRED_L: pmv->v = mva.v; break; case MVPRED_U: pmv->v = mvb.v; break; default: // case MVPRED_UR: pmv->v = mvc.v; break; } } /* --------------------------------------------------------------------------- */ static void get_mv_bskip_spatial(davs2_t *h, mv_t *fw_pmv, mv_t *bw_pmv, int num_skip_dir) { neighbor_inter_t *p_neighbors = h->lcu.neighbor_inter; mv_t *p_mv_1st = h->lcu.mv_tskip_1st; mv_t *p_mv_2nd = h->lcu.mv_tskip_2nd; int j; int bid_flag = 0, bw_flag = 0, fw_flag = 0, sym_flag = 0, bid2 = 0; memset(h->lcu.mv_tskip_1st, 0, sizeof(h->lcu.mv_tskip_1st) + sizeof(h->lcu.mv_tskip_2nd)); for (j = 0; j < 6; j++) { if (p_neighbors[j].i_dir_pred == PDIR_BID) { p_mv_2nd[DS_B_BID] = p_neighbors[j].mv[1]; p_mv_1st[DS_B_BID] = p_neighbors[j].mv[0]; bid_flag++; if (bid_flag == 1) { bid2 = j; } } else if (p_neighbors[j].i_dir_pred == PDIR_SYM) { p_mv_2nd[DS_B_SYM] = p_neighbors[j].mv[1]; p_mv_1st[DS_B_SYM] = p_neighbors[j].mv[0]; sym_flag++; } else if (p_neighbors[j].i_dir_pred == PDIR_BWD) { p_mv_2nd[DS_B_BWD] = p_neighbors[j].mv[1]; bw_flag++; } else if (p_neighbors[j].i_dir_pred == PDIR_FWD) { p_mv_1st[DS_B_FWD] = p_neighbors[j].mv[0]; fw_flag++; } } if (bid_flag == 0 && fw_flag != 0 && bw_flag != 0) { p_mv_2nd[DS_B_BID] = p_mv_2nd[DS_B_BWD]; p_mv_1st[DS_B_BID] = p_mv_1st[DS_B_FWD ]; } if (sym_flag == 0 && bid_flag > 1) { p_mv_2nd[DS_B_SYM] = p_neighbors[bid2].mv[1]; p_mv_1st[DS_B_SYM] = p_neighbors[bid2].mv[0]; } else if (sym_flag == 0 && bw_flag != 0) { p_mv_2nd[DS_B_SYM].v = p_mv_2nd[DS_B_BWD].v; p_mv_1st[DS_B_SYM].x = -p_mv_2nd[DS_B_BWD].x; p_mv_1st[DS_B_SYM].y = -p_mv_2nd[DS_B_BWD].y; } else if (sym_flag == 0 && fw_flag != 0) { p_mv_2nd[DS_B_SYM].x = -p_mv_1st[DS_B_FWD].x; p_mv_2nd[DS_B_SYM].y = -p_mv_1st[DS_B_FWD].y; p_mv_1st[DS_B_SYM].v = p_mv_1st[DS_B_FWD].v; } if (bw_flag == 0 && bid_flag > 1) { p_mv_2nd[DS_B_BWD] = p_neighbors[bid2].mv[1]; } else if (bw_flag == 0 && bid_flag != 0) { p_mv_2nd[DS_B_BWD] = p_mv_2nd[DS_B_BID]; } if (fw_flag == 0 && bid_flag > 1) { p_mv_1st[DS_B_FWD] = p_neighbors[bid2].mv[0]; } else if (fw_flag == 0 && bid_flag != 0) { p_mv_1st[DS_B_FWD] = p_mv_1st[DS_B_BID]; } fw_pmv->v = p_mv_1st[num_skip_dir].v; bw_pmv->v = p_mv_2nd[num_skip_dir].v; } /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE void get_mv_pf_skip_temporal(davs2_t *h, mv_t *p_mv, int block_offset, int cur_dist) { int refframe = h->fref[0]->refbuf[block_offset]; if (refframe >= 0) { mv_t tmv = h->fref[0]->mvbuf[block_offset]; int col_dist = h->fref[0]->dist_scale_refs[refframe]; p_mv->x = scale_mv_skip(h, tmv.x, cur_dist, col_dist); p_mv->y = scale_mv_skip(h, tmv.y, cur_dist, col_dist); } else { p_mv->v = 0; } } /* --------------------------------------------------------------------------- */ static void fill_mv_pf_skip_temporal(davs2_t *h, cu_t *p_cu, int pix_x, int pix_y, int cu_size) { 
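    /* Temporal Skip for P/F pictures: the co-located MV from the first
     * reference picture is rescaled to each target reference distance via
     * scale_mv_skip(), i.e. roughly mv * dist_dst / dist_col, implemented as
     * (mv * dist_dst * dist_scale_col + HALF_MULTI) >> OFFSET, where
     * dist_scale_col is the precomputed MULTI / dist_col factor.
     * Worked example (assuming the usual AVS2 constants MULTI = 16384,
     * OFFSET = 14, HALF_MULTI = 8192): mv.x = 8, dist_dst = 2, dist_col = 4
     * gives dist_scale_col = 4096 and (8 * 2 * 4096 + 8192) >> 14 = 4,
     * matching 8 * 2 / 4. */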
int spu_x = pix_x >> MIN_PU_SIZE_IN_BIT; int spu_y = pix_y >> MIN_PU_SIZE_IN_BIT; int size_in_spu = cu_size >> MIN_PU_SIZE_IN_BIT; int width_in_spu = h->i_width_in_spu; int i, l, m; mv_t mv_1st, mv_2nd; ref_idx_t ref_idx; int delta[AVS2_MAX_REFS]; int delta_src[AVS2_MAX_REFS]; ref_idx.r[0] = 0; ref_idx.r[1] = (int8_t)(p_cu->i_weighted_skipmode != 0 ? p_cu->i_weighted_skipmode : INVALID_REF); for (i = 0; i < h->num_of_references; i++) { delta[i] = get_distance_index_p(h, i); delta_src[i] = get_distance_index_p_scale(h, i); } if (cu_size != MIN_CU_SIZE) { size_in_spu >>= 1; assert(p_cu->num_pu == 4); } else { assert(p_cu->num_pu == 1); } for (i = 0; i < p_cu->num_pu; i++) { int block_x = spu_x + size_in_spu * (i & 1); int block_y = spu_y + size_in_spu * (i >> 1); int block_offset = block_y * width_in_spu + block_x; mv_t *p_mv_1st = h->p_tmv_1st + block_offset; mv_t *p_mv_2nd = h->p_tmv_2nd + block_offset; ref_idx_t *p_ref_1st = h->p_ref_idx + block_offset; get_mv_pf_skip_temporal(h, &mv_1st, block_offset, delta[0]); if (ref_idx.r[1] != INVALID_REF) { mv_2nd.x = scale_mv_skip (h, mv_1st.x, delta[ref_idx.r[1]], delta_src[0]); mv_2nd.y = scale_mv_skip_y(h, mv_1st.y, delta[ref_idx.r[1]], delta[0], delta_src[0]); } else { mv_2nd.v = 0; } p_cu->mv[i][0].v = mv_1st.v; p_cu->mv[i][1].v = mv_2nd.v; p_cu->ref_idx[i] = ref_idx; for (m = 0; m < size_in_spu; m++) { for (l = 0; l < size_in_spu; l++) { p_mv_1st[l] = mv_1st; p_mv_2nd[l] = mv_2nd; p_ref_1st[l] = ref_idx; } p_mv_1st += width_in_spu; p_mv_2nd += width_in_spu; p_ref_1st += width_in_spu; } } } /* --------------------------------------------------------------------------- */ static INLINE void get_mv_fskip_spatial(davs2_t *h) { neighbor_inter_t *p_neighbors = h->lcu.neighbor_inter; int bid_flag = 0, fw_flag = 0, bid2 = 0, fw2 = 0; int j; memset(h->lcu.ref_skip_1st, 0, sizeof(h->lcu.ref_skip_1st) + sizeof(h->lcu.ref_skip_2nd) + sizeof(h->lcu.mv_tskip_1st) + sizeof(h->lcu.mv_tskip_2nd)); for (j = 0; j < 6; j++) { if (p_neighbors[j].ref_idx.r[0] != -1 && p_neighbors[j].ref_idx.r[1] != -1) { // bid h->lcu.ref_skip_1st[DS_DUAL_1ST] = p_neighbors[j].ref_idx.r[0]; h->lcu.ref_skip_2nd[DS_DUAL_1ST] = p_neighbors[j].ref_idx.r[1]; h->lcu.mv_tskip_1st[DS_DUAL_1ST] = p_neighbors[j].mv[0]; h->lcu.mv_tskip_2nd[DS_DUAL_1ST] = p_neighbors[j].mv[1]; bid_flag++; if (bid_flag == 1) { bid2 = j; } } else if (p_neighbors[j].ref_idx.r[0] != -1 && p_neighbors[j].ref_idx.r[1] == -1) { // fw h->lcu.ref_skip_1st[DS_SINGLE_1ST] = p_neighbors[j].ref_idx.r[0]; h->lcu.mv_tskip_1st[DS_SINGLE_1ST] = p_neighbors[j].mv[0]; fw_flag++; if (fw_flag == 1) { fw2 = j; } } } // first bid if (bid_flag == 0 && fw_flag > 1) { h->lcu.ref_skip_1st[DS_DUAL_1ST] = h->lcu.ref_skip_1st[DS_SINGLE_1ST]; h->lcu.ref_skip_2nd[DS_DUAL_1ST] = p_neighbors[fw2].ref_idx.r[0]; h->lcu.mv_tskip_1st[DS_DUAL_1ST] = h->lcu.mv_tskip_1st[DS_SINGLE_1ST]; h->lcu.mv_tskip_2nd[DS_DUAL_1ST] = p_neighbors[fw2].mv[0]; } // second bid if (bid_flag > 1) { h->lcu.ref_skip_1st[DS_DUAL_2ND] = p_neighbors[bid2].ref_idx.r[0]; h->lcu.ref_skip_2nd[DS_DUAL_2ND] = p_neighbors[bid2].ref_idx.r[1]; h->lcu.mv_tskip_1st[DS_DUAL_2ND] = p_neighbors[bid2].mv[0]; h->lcu.mv_tskip_2nd[DS_DUAL_2ND] = p_neighbors[bid2].mv[1]; } else if (bid_flag == 1 && fw_flag > 1) { h->lcu.ref_skip_1st[DS_DUAL_2ND] = h->lcu.ref_skip_1st[DS_SINGLE_1ST]; h->lcu.ref_skip_2nd[DS_DUAL_2ND] = p_neighbors[fw2].ref_idx.r[0]; h->lcu.mv_tskip_1st[DS_DUAL_2ND] = h->lcu.mv_tskip_1st[DS_SINGLE_1ST]; h->lcu.mv_tskip_2nd[DS_DUAL_2ND] = p_neighbors[fw2].mv[0]; } // first fwd 
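    /* The DS_SINGLE_1ST candidate is kept single-directional: its second
     * reference is forced to INVALID_REF just below, and when no forward
     * neighbor was found, the forward part of a bid neighbor (bid2, or the
     * DS_DUAL_1ST entries) is borrowed instead. */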
h->lcu.ref_skip_2nd[DS_SINGLE_1ST] = INVALID_REF; h->lcu.mv_tskip_2nd [DS_SINGLE_1ST].v = 0; if (fw_flag == 0 && bid_flag > 1) { h->lcu.ref_skip_1st[DS_SINGLE_1ST] = p_neighbors[bid2].ref_idx.r[0]; h->lcu.mv_tskip_1st[DS_SINGLE_1ST] = p_neighbors[bid2].mv[0]; } else if (fw_flag == 0 && bid_flag == 1) { h->lcu.ref_skip_1st[DS_SINGLE_1ST] = h->lcu.ref_skip_1st[DS_DUAL_1ST]; h->lcu.mv_tskip_1st[DS_SINGLE_1ST] = h->lcu.mv_tskip_1st[DS_DUAL_1ST]; } // second fwd h->lcu.ref_skip_2nd[DS_SINGLE_2ND] = INVALID_REF; h->lcu.mv_tskip_2nd [DS_SINGLE_2ND].v = 0; if (fw_flag > 1) { h->lcu.ref_skip_1st[DS_SINGLE_2ND] = p_neighbors[fw2].ref_idx.r[0]; h->lcu.mv_tskip_1st[DS_SINGLE_2ND] = p_neighbors[fw2].mv[0]; } else if (bid_flag > 1) { h->lcu.ref_skip_1st[DS_SINGLE_2ND] = p_neighbors[bid2].ref_idx.r[1]; h->lcu.mv_tskip_1st[DS_SINGLE_2ND] = p_neighbors[bid2].mv[1]; } else if (bid_flag == 1) { h->lcu.ref_skip_1st[DS_SINGLE_2ND] = h->lcu.ref_skip_2nd[DS_DUAL_1ST]; h->lcu.mv_tskip_1st[DS_SINGLE_2ND] = h->lcu.mv_tskip_2nd[DS_DUAL_1ST]; } } /* --------------------------------------------------------------------------- */ static void fill_mv_bskip(davs2_t *h, cu_t *p_cu, int pix_x, int pix_y, int size_in_scu) { int width_in_spu = h->i_width_in_spu; int i8_1st = pix_x >> MIN_PU_SIZE_IN_BIT; int j8_1st = pix_y >> MIN_PU_SIZE_IN_BIT; int i; int8_t *p_dirpred; ref_idx_t *p_ref_1st; mv_t *p_mv_1st; mv_t *p_mv_2nd; mv_t mv_1st, mv_2nd; int ds_mode = p_cu->i_md_directskip_mode; assert(h->i_frame_type == AVS2_B_SLICE); if (ds_mode != DS_NONE) { int offset_spu = j8_1st * width_in_spu + i8_1st; int r, c; int cu_size_in_spu = size_in_scu << (MIN_CU_SIZE_IN_BIT - MIN_PU_SIZE_IN_BIT); ref_idx_t ref_idx; int8_t i_dir_pred; p_mv_1st = h->p_tmv_1st + offset_spu; p_mv_2nd = h->p_tmv_2nd + offset_spu; p_ref_1st = h->p_ref_idx + offset_spu; p_dirpred = h->p_dirpred + offset_spu; i_dir_pred = (int8_t)p_cu->b8pdir[0]; switch (ds_mode) { case DS_B_SYM: case DS_B_BID: ref_idx.r[0] = B_FWD; ref_idx.r[1] = B_BWD; break; case DS_B_BWD: ref_idx.r[0] = INVALID_REF; ref_idx.r[1] = B_BWD; break; // case DS_B_FWD: default: ref_idx.r[0] = B_FWD; ref_idx.r[1] = INVALID_REF; break; } get_mv_bskip_spatial(h, &mv_1st, &mv_2nd, p_cu->i_md_directskip_mode); p_cu->mv[0][0].v = mv_1st.v; p_cu->mv[0][1].v = mv_2nd.v; p_cu->ref_idx[0] = ref_idx; for (r = 0; r < cu_size_in_spu; r++) { for (c = 0; c < cu_size_in_spu; c++) { p_ref_1st[c] = ref_idx; p_mv_1st [c] = mv_1st; p_mv_2nd [c] = mv_2nd; p_dirpred[c] = i_dir_pred; } p_ref_1st += width_in_spu; p_mv_1st += width_in_spu; p_mv_2nd += width_in_spu; p_dirpred += width_in_spu; } } else { // B_Skip_Sym B_Direct_Sym int size_cu = size_in_scu << MIN_CU_SIZE_IN_BIT; int size_pu = size_cu >> (p_cu->num_pu == 4); int size_pu_in_spu = size_pu >> MIN_PU_SIZE_IN_BIT; ref_idx_t ref_idx; ref_idx.r[0] = B_FWD; ref_idx.r[1] = B_BWD; for (i = 0; i < p_cu->num_pu; i++) { int i8 = i8_1st + (i & 1) * size_in_scu; int j8 = j8_1st + (i >> 1) * size_in_scu; int r, c; int offset_spu = j8 * width_in_spu + i8; const int8_t *refbuf = h->fref[0]->refbuf; int refframe = refbuf[j8 * width_in_spu + i8]; p_mv_1st = h->p_tmv_1st + offset_spu; p_mv_2nd = h->p_tmv_2nd + offset_spu; p_ref_1st = h->p_ref_idx + offset_spu; p_dirpred = h->p_dirpred + offset_spu; if (refframe == -1) { get_mvp_default(h, p_cu, pix_x, pix_y, &mv_1st, 0, 0, size_cu, 0); get_mvp_default(h, p_cu, pix_x, pix_y, &mv_2nd, 1, 0, size_cu, 0); } else { // next P is skip or inter mode int iTRp = h->fref[0]->dist_refs[refframe]; int iTRp_src = 
h->fref[0]->dist_scale_refs[refframe];
                int iTRd = get_distance_index_b(h, B_BWD);   // bwd
                int iTRb = get_distance_index_b(h, B_FWD);   // fwd
                mv_t tmv = h->fref[0]->mvbuf[j8 * width_in_spu + i8];

                mv_1st.x =  scale_mv_biskip  (h, tmv.x, iTRb, iTRp_src);
                mv_2nd.x = -scale_mv_biskip  (h, tmv.x, iTRd, iTRp_src);
                mv_1st.y =  scale_mv_biskip_y(h, tmv.y, iTRb, iTRp, iTRp_src);
                mv_2nd.y = -scale_mv_biskip_y(h, tmv.y, iTRd, iTRp, iTRp_src);
            }

            p_cu->mv[i][0].v   = mv_1st.v;
            p_cu->mv[i][1].v   = mv_2nd.v;
            p_cu->ref_idx[i].v = ref_idx.v;

            for (r = 0; r < size_pu_in_spu; r++) {
                for (c = 0; c < size_pu_in_spu; c++) {
                    p_mv_1st [c]   = mv_1st;
                    p_mv_2nd [c]   = mv_2nd;
                    p_ref_1st[c].v = ref_idx.v;
                    p_dirpred[c]   = PDIR_SYM;
                }
                p_ref_1st += width_in_spu;
                p_mv_1st  += width_in_spu;
                p_mv_2nd  += width_in_spu;
                p_dirpred += width_in_spu;
            }
        }  // for loop all PUs
    }  // B_Skip_Sym B_Direct_Sym
}

/* ---------------------------------------------------------------------------
 * Skip/Direct mode: derive the motion information (reference indices and
 * motion vectors) of every block inside the current CU from the motion
 * information of neighboring blocks, and write it back to the picture-level
 * reference-index and MV buffers.
 */
void fill_mv_and_ref_for_skip(davs2_t *h, cu_t *p_cu, int pix_x, int pix_y, int size_in_scu)
{
    assert(p_cu->i_cu_type == PRED_SKIP);

    if (h->i_frame_type == AVS2_B_SLICE) {
        fill_mv_bskip(h, p_cu, pix_x, pix_y, size_in_scu);
    } else if ((h->i_frame_type == AVS2_F_SLICE) || (h->i_frame_type == AVS2_P_SLICE)) {
        if (p_cu->i_md_directskip_mode == 0) {
            fill_mv_pf_skip_temporal(h, p_cu, pix_x, pix_y, size_in_scu << MIN_CU_SIZE_IN_BIT);
        } else {
            int width_in_spu = h->i_width_in_spu;
            int block_offset = (pix_y >> MIN_PU_SIZE_IN_BIT) * width_in_spu + (pix_x >> MIN_PU_SIZE_IN_BIT);
            ref_idx_t *p_ref_1st = h->p_ref_idx + block_offset;
            mv_t      *p_tmv_1st = h->p_tmv_1st + block_offset;
            mv_t      *p_tmv_2nd = h->p_tmv_2nd + block_offset;
            int i, j;
            mv_t mv_1st, mv_2nd;
            ref_idx_t ref_idx;
            int ds_mode = p_cu->i_md_directskip_mode;

            get_mv_fskip_spatial(h);
            mv_1st = h->lcu.mv_tskip_1st[ds_mode];
            mv_2nd = h->lcu.mv_tskip_2nd[ds_mode];
            ref_idx.r[0] = h->lcu.ref_skip_1st[ds_mode];
            ref_idx.r[1] = h->lcu.ref_skip_2nd[ds_mode];

            for (i = 0; i < 4; i++) {
                p_cu->mv[i][0].v = mv_1st.v;
                p_cu->mv[i][1].v = mv_2nd.v;
                p_cu->ref_idx[i] = ref_idx;
            }

            size_in_scu <<= (MIN_CU_SIZE_IN_BIT - MIN_PU_SIZE_IN_BIT);
            for (j = 0; j < size_in_scu; j++) {
                for (i = 0; i < size_in_scu; i++) {
                    p_ref_1st[i] = ref_idx;
                    p_tmv_1st[i] = mv_1st;
                    p_tmv_2nd[i] = mv_2nd;
                }
                p_ref_1st += width_in_spu;
                p_tmv_1st += width_in_spu;
                p_tmv_2nd += width_in_spu;
            }
        }
    }
}
davs2-1.6/source/common/predict.h000066400000000000000000000252551337322544400170060ustar00rootroot00000000000000/*
 * predict.h
 *
 * Description of this file:
 *    Prediction functions definition of the davs2 library
 *
 * --------------------------------------------------------------------------
 *
 * davs2 - video decoder of AVS2/IEEE1857.4 video coding standard
 * Copyright (C) 2018~ VCL, NELVT, Peking University
 *
 * Authors: Falei LUO
 *          etc.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at sswang @ pku.edu.cn.
 */

#ifndef DAVS2_PRED_H
#define DAVS2_PRED_H

#ifdef __cplusplus
extern "C" {
#endif

/* ---------------------------------------------------------------------------
 * wrap a reference-frame distance into its valid range (modulo 512) */
#define AVS2_DISTANCE_INDEX(distance)   (((distance) + 512) & 511)

/* ---------------------------------------------------------------------------
 * distance between a reference frame of a P/F picture and the current picture */
static ALWAYS_INLINE int get_distance_index_p(davs2_t *h, int refidx)
{
    return h->fdec->dist_refs[refidx];
}

/* ---------------------------------------------------------------------------
 * scaled-distance factor of a reference frame of a P/F picture
 * (the precomputed multiplier that replaces division by the distance) */
static ALWAYS_INLINE int get_distance_index_p_scale(davs2_t *h, int refidx)
{
    return h->fdec->dist_scale_refs[refidx];
}

/* ---------------------------------------------------------------------------
 * distance between a reference frame of a B picture and the current picture */
static ALWAYS_INLINE int get_distance_index_b(davs2_t *h, int b_fwd)
{
    return h->fdec->dist_refs[b_fwd];
}

/* ---------------------------------------------------------------------------
 * scaled-distance factor of a reference frame of a B picture */
static ALWAYS_INLINE int get_distance_index_b_scale(davs2_t *h, int b_fwd)
{
    return h->fdec->dist_scale_refs[b_fwd];
}

/* ---------------------------------------------------------------------------
 * derive the vertical offsets (delt, delt2) used when scaling the y component
 * of a motion vector under field coding */
static ALWAYS_INLINE int getDeltas(davs2_t *h, int *delt, int *delt2, int OriPOC, int OriRefPOC, int ScaledPOC, int ScaledRefPOC)
{
    int factor = 2;

    *delt  = 0;
    *delt2 = 0;

    assert(h->seq_info.b_field_coding);
    assert(h->i_pic_coding_type == FRAME);

    OriPOC       = AVS2_DISTANCE_INDEX(OriPOC);
    OriRefPOC    = AVS2_DISTANCE_INDEX(OriRefPOC);
    ScaledPOC    = AVS2_DISTANCE_INDEX(ScaledPOC);
    ScaledRefPOC = AVS2_DISTANCE_INDEX(ScaledRefPOC);

    assert((OriPOC % factor) + (OriRefPOC % factor) + (ScaledPOC % factor) + (ScaledRefPOC % factor) == 0);

    OriPOC       /= factor;
    OriRefPOC    /= factor;
    ScaledPOC    /= factor;
    ScaledRefPOC /= factor;

    if (h->b_top_field) {       // scaled is top field
        *delt2 = (ScaledRefPOC % 2) != (ScaledPOC % 2) ? 2 : 0;

        if ((ScaledPOC % 2) == (OriPOC % 2)) {  // ori is top
            *delt = (OriRefPOC % 2) != (OriPOC % 2) ? 2 : 0;
        } else {
            *delt = (OriRefPOC % 2) != (OriPOC % 2) ? -2 : 0;
        }
    } else {                    // scaled is bottom field
        *delt2 = (ScaledRefPOC % 2) != (ScaledPOC % 2) ? -2 : 0;

        if ((ScaledPOC % 2) == (OriPOC % 2)) {  // ori is bottom
            *delt = (OriRefPOC % 2) != (OriPOC % 2) ? -2 : 0;
        } else {
            *delt = (OriRefPOC % 2) != (OriPOC % 2) ?
2 : 0; } } return 0; } /* --------------------------------------------------------------------------- * MV scaling for Normal Inter Mode (MVP + MVD) */ static ALWAYS_INLINE int16_t scale_mv_default(davs2_t *h, int mv, int dist_dst, int dist_src) { UNUSED_PARAMETER(h); mv = davs2_sign3(mv) * ((DAVS2_ABS(mv) * dist_dst * dist_src + HALF_MULTI) >> OFFSET); return (int16_t)(DAVS2_CLIP3(-32768, 32767, mv)); } /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE int16_t scale_mv_default_y(davs2_t *h, int mvy, int dist_dst, int dist_src, int dist_src_mul) { if (h->seq_info.b_field_coding) { int oriPOC = h->fdec->i_poc; int oriRefPOC = oriPOC - dist_src; int scaledPOC = h->fdec->i_poc; int scaledRefPOC = scaledPOC - dist_dst; int delta, delta2; getDeltas(h, &delta, &delta2, oriPOC, oriRefPOC, scaledPOC, scaledRefPOC); return (int16_t)(scale_mv_default(h, mvy + delta, dist_dst, dist_src_mul) - delta2); } else { return scale_mv_default(h, mvy, dist_dst, dist_src_mul); } } // ---------------------------------------------------------- // MV scaling for Skip/Direct Mode static ALWAYS_INLINE int16_t scale_mv_skip(davs2_t *h, int mv, int dist_dst, int dist_src) { UNUSED_PARAMETER(h); mv = (int16_t)((mv * dist_dst * dist_src + HALF_MULTI) >> OFFSET); return (int16_t)(DAVS2_CLIP3(-32768, 32767, mv)); } static ALWAYS_INLINE int16_t scale_mv_skip_y(davs2_t *h, int mvy, int dist_dst, int dist_src ,int dist_src_mul) { if (h->seq_info.b_field_coding) { int oriPOC = h->fdec->i_poc; int oriRefPOC = oriPOC - dist_src; int scaledPOC = h->fdec->i_poc; int scaledRefPOC = scaledPOC - dist_dst; int delta, delta2; getDeltas(h, &delta, &delta2, oriPOC, oriRefPOC, scaledPOC, scaledRefPOC); return (int16_t)(scale_mv_skip(h, mvy + delta, dist_dst, dist_src_mul) - delta2); } else { return scale_mv_skip(h, mvy, dist_dst, dist_src_mul); } } // ---------------------------------------------------------- // MV scaling for Bi-Skip/Direct Mode static ALWAYS_INLINE int16_t scale_mv_biskip(davs2_t *h, int mv, int dist_dst, int dist_src) { UNUSED_PARAMETER(h); mv = (int16_t)(davs2_sign3(mv) * ((dist_src * (1 + DAVS2_ABS(mv) * dist_dst) - 1) >> OFFSET)); return (int16_t)(DAVS2_CLIP3(-32768, 32767, mv)); } static ALWAYS_INLINE int16_t scale_mv_biskip_y(davs2_t *h, int mvy, int dist_dst, int dist_src, int dist_src_mul) { if (h->seq_info.b_field_coding) { int oriPOC = h->fdec->i_poc; int oriRefPOC = oriPOC - dist_src; int scaledPOC = h->fdec->i_poc; int scaledRefPOC = scaledPOC - dist_dst; int delta, delta2; getDeltas(h, &delta, &delta2, oriPOC, oriRefPOC, scaledPOC, scaledRefPOC); return (int16_t)(scale_mv_biskip(h, mvy + delta, dist_dst, dist_src_mul) - delta2); } else { return scale_mv_biskip(h, mvy, dist_dst, dist_src_mul); } } /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE void pmvr_mv_derivation(davs2_t *h, mv_t *mv, mv_t *mvd, mv_t *mvp) { int mvx, mvy; if (h->seq_info.enable_pmvr) { int ctr_x, ctr_y; ctr_x = ((mvp->x >> 1) << 1) - mvp->x; ctr_y = ((mvp->y >> 1) << 1) - mvp->y; if (DAVS2_ABS(mvd->x - ctr_x) > THRESHOLD_PMVR) { mvx = mvp->x + (mvd->x << 1) - ctr_x - davs2_sign2(mvd->x - ctr_x) * THRESHOLD_PMVR; mvy = mvp->y + (mvd->y << 1) + ctr_y; } else if (DAVS2_ABS(mvd->y - ctr_y) > THRESHOLD_PMVR) { mvx = mvp->x + (mvd->x << 1) + ctr_x; mvy = mvp->y + (mvd->y << 1) - ctr_y - davs2_sign2(mvd->y - ctr_y) * THRESHOLD_PMVR; } else { mvx = mvd->x + mvp->x; mvy = mvd->y + mvp->y; } } else { mvx = mvd->x + mvp->x; mvy = mvd->y + 
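/* [Editor's note] Sketch of the fixed-point MV scaling used by the
 * scale_mv_* helpers above, assuming the usual AVS2 constants
 * OFFSET == 14 and HALF_MULTI == 1 << 13, and that dist_src holds the
 * pre-computed reciprocal scale (MULTI / distance):
 *
 *     scaled = sign(mv) * ((|mv| * dist_dst * dist_src + HALF_MULTI) >> OFFSET);
 *
 * i.e. a rounded approximation of mv * dist_dst / distance. E.g. for
 * mv = -8, dist_dst = 2 and a source distance of 4 (dist_src = 4096):
 * (8 * 2 * 4096 + 8192) >> 14 == 4, giving -4 == -8 * 2 / 4. */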
mvp->y; } mv->x = (int16_t)DAVS2_CLIP3(-32768, 32767, mvx); mv->y = (int16_t)DAVS2_CLIP3(-32768, 32767, mvy); } /* --------------------------------------------------------------------------- * get spatial neighboring MV */ static ALWAYS_INLINE void cu_get_neighbor_spatial(davs2_t *h, int cur_slice_idx, neighbor_inter_t *p_neighbor, int x4, int y4) { int b_outside_pic = y4 < 0 || y4 >= h->i_height_in_spu || x4 < 0 || x4 >= h->i_width_in_spu; int scu_xy = (y4 >> 1) * h->i_width_in_scu + (x4 >> 1); if (b_outside_pic || h->scu_data[scu_xy].i_slice_nr != cur_slice_idx) { p_neighbor->is_available = 0; p_neighbor->i_dir_pred = PDIR_INVALID; p_neighbor->ref_idx.r[0] = INVALID_REF; p_neighbor->ref_idx.r[1] = INVALID_REF; p_neighbor->mv[0].v = 0; p_neighbor->mv[1].v = 0; } else { const int w_in_4x4 = h->i_width_in_spu; const int pos = y4 * w_in_4x4 + x4; p_neighbor->is_available = 1; p_neighbor->i_dir_pred = h->p_dirpred[pos]; p_neighbor->ref_idx = h->p_ref_idx[pos]; p_neighbor->mv[0] = h->p_tmv_1st[pos]; p_neighbor->mv[1] = h->p_tmv_2nd[pos]; } } /* --------------------------------------------------------------------------- * get temporal MV predictor */ static ALWAYS_INLINE void cu_get_neighbor_temporal(davs2_t *h, neighbor_inter_t *p_neighbor, int x4, int y4) { int w_in_16x16 = (h->i_width_in_spu + 3) >> 2; int pos = (y4 /*>> 2*/) * w_in_16x16 + (x4 /*>> 2*/); p_neighbor->is_available = 1; p_neighbor->i_dir_pred = PDIR_FWD; p_neighbor->ref_idx.r[0] = h->fref[0]->refbuf[pos]; p_neighbor->mv[0] = h->fref[0]->mvbuf[pos]; p_neighbor->ref_idx.r[1] = INVALID_REF; p_neighbor->mv[1].v = 0; } /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE int get_pu_type_for_mvp(int bsx, int bsy, int cu_pix_x, int cu_pix_y) { if (bsx < bsy) { if (cu_pix_x == 0) { return 1; } else { return 2; } } else if (bsx > bsy) { if (cu_pix_y == 0) { return 3; } else { return 4; } } return 0; // default } #define get_mvp_default FPFX(get_mvp_default) void get_mvp_default(davs2_t *h, cu_t *p_cu, int pix_x, int pix_y, mv_t *pmv, int bwd_2nd, int ref_frame, int bsx, int pu_type_for_mvp); #define fill_mv_and_ref_for_skip FPFX(fill_mv_and_ref_for_skip) void fill_mv_and_ref_for_skip(davs2_t *h, cu_t *p_cu, int pix_x, int pix_y, int size_in_scu); #ifdef __cplusplus } #endif #endif // DAVS2_PRED_H davs2-1.6/source/common/primitives.cc000066400000000000000000000046141337322544400177050ustar00rootroot00000000000000/* * primitives.cc * * Description of this file: * function handles initialize functions definition of the davs2 library * * -------------------------------------------------------------------------- * * davs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. 
* * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ #include "common.h" #include "primitives.h" #include "cpu.h" #include "intra.h" #include "mc.h" #include "transform.h" #include "quant.h" #include "deblock.h" #include "sao.h" #include "alf.h" /* --------------------------------------------------------------------------- */ ao_funcs_t gf_davs2 = {0}; /* --------------------------------------------------------------------------- */ void init_all_primitives(uint32_t cpuid) { if (gf_davs2.initial_count != 0) { // already initialed gf_davs2.initial_count++; return; } gf_davs2.initial_count = 1; gf_davs2.cpuid = cpuid; /* init function handles */ davs2_memory_init (cpuid, &gf_davs2); davs2_intra_pred_init(cpuid, &gf_davs2); davs2_pixel_init (cpuid, &gf_davs2); davs2_mc_init (cpuid, &gf_davs2); davs2_quant_init (cpuid, &gf_davs2); davs2_dct_init (cpuid, &gf_davs2); davs2_deblock_init (cpuid, &gf_davs2); davs2_sao_init (cpuid, &gf_davs2); davs2_alf_init (cpuid, &gf_davs2); } davs2-1.6/source/common/primitives.h000066400000000000000000000173001337322544400175430ustar00rootroot00000000000000/* * primitives.h * * Description of this file: * function handles initialize functions definition of the davs2 library * * -------------------------------------------------------------------------- * * davs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. 
*/ #ifndef DAVS2_PRIMITIVES_H #define DAVS2_PRIMITIVES_H #ifdef __cplusplus extern "C" { #endif /** * =========================================================================== * macros * =========================================================================== */ #if HIGH_BIT_DEPTH #define MC_PART_INDEX(width, height) (width >= 8) #else #define MC_PART_INDEX(width, height) (width > 8) #endif /** * =========================================================================== * function definitions and structures * =========================================================================== */ /** * =========================================================================== * type defines * =========================================================================== */ /* --------------------------------------------------------------------------- * function handle types */ typedef void(*block_copy_pp_t)(pel_t *dst, intptr_t i_dst, pel_t *src, intptr_t i_src, int w, int h); typedef void(*block_copy_sc_t)(coeff_t *dst, intptr_t i_dst, int16_t *src, intptr_t i_src, int w, int h); typedef void(*block_intpl_t)(const pel_t* src, intptr_t srcStride, pel_t* dst, intptr_t dstStride, int coeffIdx); typedef void(*block_intpl_ext_t)(const pel_t* src, intptr_t srcStride, pel_t* dst, intptr_t dstStride, int coeffIdxX, int coeffIdxY); typedef void(*intpl_t) (pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff); typedef void(*intpl_ext_t)(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff_x, const int8_t *coeff_y); typedef void(*pixel_avg_pp_t)(pel_t *dst, int i_dst, const pel_t *src0, int i_src0, const pel_t *src1, int i_src1, int width, int height); typedef void(*dct_t)(const coeff_t *src, coeff_t *dst, int i_src); typedef void(*intra_pred_t)(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); typedef void(*fill_edge_t)(const pel_t *p_topleft, int i_topleft, const pel_t *p_lcu_ep, pel_t *EP, uint32_t i_avail, int bsx, int bsy); typedef void *(*memcpy_t)(void *dst, const void *src, size_t n); typedef void(*copy_pp_t)(pel_t* dst, intptr_t dstStride, const pel_t* src, intptr_t srcStride); // dst is aligned typedef void(*copy_ss_t)(coeff_t* dst, intptr_t dstStride, const coeff_t* src, intptr_t srcStride); typedef void(*pixel_add_ps_t)(pel_t* dst, intptr_t dstride, const pel_t* b0, const coeff_t* b1, intptr_t sstride0, intptr_t sstride1); typedef void(*lcu_deblock_t)(davs2_t *h, davs2_frame_t *frm, int i_lcu_x, int i_lcu_y); typedef void(*sao_flt_bo_t)(pel_t *p_dst, int i_dst, const pel_t *p_src, int i_src, int i_block_w, int i_block_h, int bit_depth, const sao_param_t *sao_param); typedef void(*sao_flt_eo_t)(pel_t *p_dst, int i_dst, const pel_t *p_src, int i_src, int i_block_w, int i_block_h, int bit_depth, const int *lcu_avail, const int *sao_offset); /* --------------------------------------------------------------------------- * assembly optimization functions */ typedef struct ao_funcs_t { ALIGN32(uint32_t initial_count); uint32_t cpuid; /* memory copy */ memcpy_t fast_memcpy; memcpy_t memcpy_aligned; void*(*fast_memzero) (void *dst, size_t n); void*(*memzero_aligned)(void *dst, size_t n); void*(*fast_memset) (void *dst, int val, size_t n); /* plane copy */ void(*plane_copy)(pel_t *dst, intptr_t i_dst, pel_t *src, intptr_t i_src, int w, int h); block_copy_pp_t block_copy; block_copy_sc_t block_coeff_copy; copy_pp_t copy_pp[MAX_PART_NUM]; copy_ss_t copy_ss[MAX_PART_NUM]; pixel_add_ps_t add_ps[MAX_PART_NUM]; /* block 
average */
    pixel_avg_pp_t block_avg;

    /* interpolate */
#if USE_NEW_INTPL
    block_intpl_t     block_intpl_luma_hor[MAX_PART_NUM];
    block_intpl_t     block_intpl_luma_ver[MAX_PART_NUM];
    block_intpl_ext_t block_intpl_luma_ext[MAX_PART_NUM];
#endif
    intpl_t     intpl_luma_ver[2][3];  // [2]: selected by block size (0: size <= 8; 1: size >= 16); [3]: different filter coefficient sets
    intpl_t     intpl_luma_hor[2][3];
    intpl_ext_t intpl_luma_ext[2];
    intpl_t     intpl_chroma_ver[2];
    intpl_t     intpl_chroma_hor[2];
    intpl_ext_t intpl_chroma_ext[2];

    /* intra prediction */
    intra_pred_t intraf[NUM_INTRA_MODE];
    fill_edge_t  fill_edge_f[4];

    /* loop filter */
    void (*set_deblock_const)(void);
    void (*deblock_luma[2])  (pel_t *src, int stride, int alpha, int beta, uint8_t *flt_flag);
#if HDR_CHROMA_DELTA_QP
    void (*deblock_chroma[2])(pel_t *src_u, pel_t *src_v, int stride, int *alpha, int *beta, uint8_t *flt_flag);
#else
    void (*deblock_chroma[2])(pel_t *src_u, pel_t *src_v, int stride, int alpha, int beta, uint8_t *flt_flag);
#endif

    /* SAO filter */
    sao_flt_bo_t sao_block_bo;       /* filter for bo type */
    sao_flt_eo_t sao_filter_eo[4];   /* SAO filter for eo types */

    /* alf */
    void (*alf_block[2])(pel_t *p_dst, const pel_t *p_src, int stride, int lcu_pix_x, int lcu_pix_y, int lcu_width, int lcu_height, int *alf_coeff, int b_top_avail, int b_down_avail);

    /* dct */
    dct_t idct[MAX_PART_NUM][DCT_PATTERN_NUM];  /* sqrt dct */

    /* 2nd transform */
    void (*inv_transform_4x4_2nd)(coeff_t *coeff, int i_coeff);
    void (*inv_transform_2nd)    (coeff_t *coeff, int i_coeff, int i_mode, int b_top, int b_left);

    /* quant */
    void (*dequant)(coeff_t *coef, const int i_coef, const int scale, const int shift);
} ao_funcs_t;

extern ao_funcs_t gf_davs2;

/**
 * ===========================================================================
 * interface function declares
 * ===========================================================================
 */
#define init_all_primitives FPFX(init_all_primitives)
void init_all_primitives(uint32_t cpuid);

/* ---------------------------------------------------------------------------
 * extern functions
 */
#define davs2_mc_init FPFX(mc_init)
void davs2_mc_init    (uint32_t cpuid, ao_funcs_t *pf);

#define davs2_pixel_init FPFX(pixel_init)
void davs2_pixel_init (uint32_t cpuid, ao_funcs_t* pixf);

#define davs2_memory_init FPFX(memory_init)
void davs2_memory_init(uint32_t cpuid, ao_funcs_t* pixf);

#ifdef __cplusplus
}
#endif

#endif  // DAVS2_PRIMITIVES_H
davs2-1.6/source/common/quant.cc000066400000000000000000000240501337322544400166360ustar00rootroot00000000000000/*
 * quant.cc
 *
 * Description of this file:
 *    Quant functions definition of the davs2 library
 *
 * --------------------------------------------------------------------------
 *
 * davs2 - video decoder of AVS2/IEEE1857.4 video coding standard
 * Copyright (C) 2018~ VCL, NELVT, Peking University
 *
 *    Authors: Falei LUO
 *             etc.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at sswang @ pku.edu.cn.
 */

#include "common.h"
#include "quant.h"
#include "vec/intrinsic.h"

/* --------------------------------------------------------------------------- */
const int16_t wq_param_default[2][6] = {
    { 67, 71, 71, 80, 80, 106 },
    { 64, 49, 53, 58, 58,  64 }
};

/* --------------------------------------------------------------------------- */
static const int g_WqMDefault4x4[16] = {
    64, 64, 64, 68,
    64, 64, 68, 72,
    64, 68, 76, 80,
    72, 76, 84, 96
};

/* --------------------------------------------------------------------------- */
static const int g_WqMDefault8x8[64] = {
     64,  64,  64,  64,  68,  68,  72,  76,
     64,  64,  64,  68,  72,  76,  84,  92,
     64,  64,  68,  72,  76,  80,  88, 100,
     64,  68,  72,  80,  84,  92, 100, 112,
     68,  72,  80,  84,  92, 104, 112, 128,
     76,  80,  84,  92, 104, 116, 132, 152,
     96, 100, 104, 116, 124, 140, 164, 188,
    104, 108, 116, 128, 152, 172, 192, 216
};

/* --------------------------------------------------------------------------- */
static const uint8_t WeightQuantModel[4][64] = {
    //  l  a  b  c  d  h
    //  0  1  2  3  4  5
    {   // Mode 0
        0, 0, 0, 4, 4, 4, 5, 5,
        0, 0, 3, 3, 3, 3, 5, 5,
        0, 3, 2, 2, 1, 1, 5, 5,
        4, 3, 2, 2, 1, 5, 5, 5,
        4, 3, 1, 1, 5, 5, 5, 5,
        4, 3, 1, 5, 5, 5, 5, 5,
        5, 5, 5, 5, 5, 5, 5, 5,
        5, 5, 5, 5, 5, 5, 5, 5
    },
    {   // Mode 1
        0, 0, 0, 4, 4, 4, 5, 5,
        0, 0, 4, 4, 4, 4, 5, 5,
        0, 3, 2, 2, 2, 1, 5, 5,
        3, 3, 2, 2, 1, 5, 5, 5,
        3, 3, 2, 1, 5, 5, 5, 5,
        3, 3, 1, 5, 5, 5, 5, 5,
        5, 5, 5, 5, 5, 5, 5, 5,
        5, 5, 5, 5, 5, 5, 5, 5
    },
    {   // Mode 2
        0, 0, 0, 4, 4, 3, 5, 5,
        0, 0, 4, 4, 3, 2, 5, 5,
        0, 4, 4, 3, 2, 1, 5, 5,
        4, 4, 3, 2, 1, 5, 5, 5,
        4, 3, 2, 1, 5, 5, 5, 5,
        3, 2, 1, 5, 5, 5, 5, 5,
        5, 5, 5, 5, 5, 5, 5, 5,
        5, 5, 5, 5, 5, 5, 5, 5
    },
    {   // Mode 3
        0, 0, 0, 3, 2, 1, 5, 5,
        0, 0, 4, 3, 2, 1, 5, 5,
        0, 4, 4, 3, 2, 1, 5, 5,
        3, 3, 3, 3, 2, 5, 5, 5,
        2, 2, 2, 2, 5, 5, 5, 5,
        1, 1, 1, 5, 5, 5, 5, 5,
        5, 5, 5, 5, 5, 5, 5, 5,
        5, 5, 5, 5, 5, 5, 5, 5
    }
};

/* --------------------------------------------------------------------------- */
static const uint8_t WeightQuantModel4x4[4][16] = {
    //  l  a  b  c  d  h
    //  0  1  2  3  4  5
    {   // Mode 0
        0, 4, 3, 5,
        4, 2, 1, 5,
        3, 1, 1, 5,
        5, 5, 5, 5
    },
    {   // Mode 1
        0, 4, 4, 5,
        3, 2, 2, 5,
        3, 2, 1, 5,
        5, 5, 5, 5
    },
    {   // Mode 2
        0, 4, 3, 5,
        4, 3, 2, 5,
        3, 2, 1, 5,
        5, 5, 5, 5
    },
    {   // Mode 3
        0, 3, 1, 5,
        3, 4, 2, 5,
        1, 2, 2, 5,
        5, 5, 5, 5
    }
};

/**
 * ===========================================================================
 * function defines
 * ===========================================================================
 */

/* --------------------------------------------------------------------------- */
const int *wq_get_default_matrix(int sizeId)
{
    return (sizeId == 0) ?
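/* [Editor's note] Minimal usage sketch for wq_get_default_matrix (the
 * caller below is hypothetical): sizeId 0 selects the flat row-major
 * 4x4 default weighting matrix, any other sizeId the 8x8 one,
 *
 *     const int *wqm4 = wq_get_default_matrix(0);   // 16 entries
 *     const int *wqm8 = wq_get_default_matrix(1);   // 64 entries
 *
 * mirroring how sequence headers fall back to the default matrices. */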
g_WqMDefault4x4 : g_WqMDefault8x8; } /* --------------------------------------------------------------------------- */ void wq_init_frame_quant_param(davs2_t *h) { weighted_quant_t *p = &h->wq; int uiWQMSizeId; int i, j, k; assert(h->seq_info.enable_weighted_quant); for (uiWQMSizeId = 0; uiWQMSizeId < 4; uiWQMSizeId++) { for (i = 0; i < 64; i++) { p->cur_wq_matrix[uiWQMSizeId][i] = 1 << 7; } } for (i = 0; i < 2; i++) { for (j = 0; j < 6; j++) { p->wquant_param[i][j] = 128; } } if (p->wq_param == 0) { for (i = 0; i < 6; i++) { p->wquant_param[DETAILED][i] = wq_param_default[DETAILED][i]; } } else if (p->wq_param == 1) { for (i = 0; i < 6; i++) { p->wquant_param[UNDETAILED][i] = p->quant_param_undetail[i]; } } if (p->wq_param == 2) { for (i = 0; i < 6; i++) { p->wquant_param[DETAILED][i] = p->quant_param_detail[i]; } } // reconstruct the weighting matrix for (k = 0; k < 2; k++) { for (j = 0; j < 8; j++) { for (i = 0; i < 8; i++) { p->wq_matrix[1][k][j * 8 + i] = p->wquant_param[k][WeightQuantModel[p->wq_model][j * 8 + i]]; } } } for (k = 0; k < 2; k++) { for (j = 0; j < 4; j++) { for (i = 0; i < 4; i++) { p->wq_matrix[0][k][j * 4 + i] = p->wquant_param[k][WeightQuantModel4x4[p->wq_model][j * 4 + i]]; } } } } /* --------------------------------------------------------------------------- */ void wq_update_frame_matrix(davs2_t *h) { weighted_quant_t *p = &h->wq; int uiWQMSizeId, uiWMQId; int uiBlockSize; int i; assert(h->seq_info.enable_weighted_quant); for (uiWQMSizeId = 0; uiWQMSizeId < 4; uiWQMSizeId++) { uiBlockSize = DAVS2_MIN(1 << (uiWQMSizeId + 2), 8); uiWMQId = (uiWQMSizeId < 2) ? uiWQMSizeId : 1; if (p->pic_wq_data_index == 0) { for (i = 0; i < (uiBlockSize * uiBlockSize); i++) { p->cur_wq_matrix[uiWQMSizeId][i] = p->seq_wq_matrix[uiWMQId][i]; } } else if (p->pic_wq_data_index == 1) { if (p->wq_param == 0) { for (i = 0; i < (uiBlockSize * uiBlockSize); i++) { p->cur_wq_matrix[uiWQMSizeId][i] = p->wq_matrix[uiWMQId][DETAILED][i];// detailed weighted matrix } } else if (p->wq_param == 1) { for (i = 0; i < (uiBlockSize * uiBlockSize); i++) { p->cur_wq_matrix[uiWQMSizeId][i] = p->wq_matrix[uiWMQId][0][i]; // undetailed weighted matrix } } if (p->wq_param == 2) { for (i = 0; i < (uiBlockSize * uiBlockSize); i++) { p->cur_wq_matrix[uiWQMSizeId][i] = p->wq_matrix[uiWMQId][1][i]; // detailed weighted matrix } } } else if (p->pic_wq_data_index == 2) { for (i = 0; i < (uiBlockSize * uiBlockSize); i++) { p->cur_wq_matrix[uiWQMSizeId][i] = p->pic_user_wq_matrix[uiWMQId][i]; } } } } /* --------------------------------------------------------------------------- */ static void dequant_c(coeff_t *p_coeff, const int i_coef, const int scale, const int shift) { const int add = (1 << (shift - 1)); int i; for (i = 0; i < i_coef; i++) { if (p_coeff[i]) { p_coeff[i] = (coeff_t)DAVS2_CLIP3(-32768, 32767, (p_coeff[i] * scale + add) >> shift); } } } /* --------------------------------------------------------------------------- */ static void dequant_weighted_c(coeff_t *p_coeff, int i_coeff, int bsx, int bsy, int scale, int shift, int16_t *wq_matrix, int wqm_shift, int wqm_size_id) { const int add = 1 << (shift - 1); const int wqm_size = 1 << (wqm_size_id + 2); const int stride_shift = DAVS2_CLIP3(0, 2, wqm_size_id - 1); const int stride = wqm_size >> stride_shift; int i, j; for (j = 0; j < bsy; j++) { for (i = 0; i < bsx; i++) { int wqm_coef = wq_matrix[((j >> stride_shift) & (stride - 1)) * stride + ((i >> stride_shift) & (stride - 1))]; if (p_coeff[i]) { int cur_coeff = (((((p_coeff[i] * wqm_coef) >> 
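/* [Editor's note] Worked example of the plain dequant rounding in
 * dequant_c above (numbers hypothetical): with scale = 32 and shift = 7,
 * a coefficient of 100 becomes
 *
 *     (100 * 32 + (1 << 6)) >> 7  ==  3264 >> 7  ==  25
 *
 * so the "+ (1 << (shift - 1))" term implements round-to-nearest before
 * the arithmetic right shift, and the result is clipped to int16 range. */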
wqm_shift) * scale) >> 4) + add) >> shift; p_coeff[i] = (coeff_t)DAVS2_CLIP3(-32768, 32767, cur_coeff); } } p_coeff += i_coeff; } } /* --------------------------------------------------------------------------- * dequant the coefficients */ void dequant_coeffs(davs2_t *h, coeff_t *p_coeff, int bsx, int bsy, int scale, int shift, int WQMSizeId) { if (h->seq_info.enable_weighted_quant) { int wqm_shift = (h->wq.pic_wq_data_index == 1) ? 3 : 0; dequant_weighted_c(p_coeff, bsx, bsx, bsy, scale, shift, h->wq.cur_wq_matrix[WQMSizeId], wqm_shift, WQMSizeId); } else { gf_davs2.dequant(p_coeff, bsx * bsy, scale, shift); } } /* --------------------------------------------------------------------------- */ void davs2_quant_init(uint32_t cpuid, ao_funcs_t *fh) { /* init c function handles */ fh->dequant = dequant_c; /* init asm function handles */ #if HAVE_MMX if (cpuid & DAVS2_CPU_SSE4) { fh->dequant = davs2_dequant_sse4; } #endif // if HAVE_MMX } davs2-1.6/source/common/quant.h000066400000000000000000000076151337322544400165100ustar00rootroot00000000000000/* * quant.h * * Description of this file: * Quant functions definition of the davs2 library * * -------------------------------------------------------------------------- * * davs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ #ifndef DAVS2_QUANT_H #define DAVS2_QUANT_H #ifdef __cplusplus extern "C" { #endif #define QP_SCALE_CR FPFX(QP_SCALE_CR) extern const uint8_t QP_SCALE_CR[]; #define IQ_SHIFT FPFX(IQ_SHIFT) extern const int16_t IQ_SHIFT[]; #define IQ_TAB FPFX(IQ_TAB) extern const uint16_t IQ_TAB[]; #define wq_param_default FPFX(wq_param_default) extern const int16_t wq_param_default[2][6]; /** * --------------------------------------------------------------------------- * Weight Quant * - Adaptive Frequency Weighting Quantization, include: * a). Frequency weighting model, quantization * b). Picture level user-defined frequency weighting * c). 
LCU level adaptive frequency weighting mode decision * According to adopted proposals: m1878, m2148, m2331 * --------------------------------------------------------------------------- */ #define PARAM_NUM 6 #define WQ_MODEL_NUM 3 #define UNDETAILED 0 #define DETAILED 1 #define WQ_MODE_F 0 #define WQ_MODE_U 1 #define WQ_MODE_D 2 #define wq_get_default_matrix FPFX(wq_get_default_matrix) const int *wq_get_default_matrix(int sizeId); #define wq_init_frame_quant_param FPFX(wq_init_frame_quant_param) void wq_init_frame_quant_param(davs2_t *h); #define wq_update_frame_matrix FPFX(wq_update_frame_matrix) void wq_update_frame_matrix(davs2_t *h); /* dequant */ #define dequant_coeffs FPFX(dequant_coeffs) void dequant_coeffs(davs2_t *h, coeff_t *p_coeff, int bsx, int bsy, int scale, int shift, int WQMSizeId); #define davs2_quant_init FPFX(quant_init) void davs2_quant_init(uint32_t cpuid, ao_funcs_t *fh); /* --------------------------------------------------------------------------- * get qp in chroma component */ static ALWAYS_INLINE int cu_get_chroma_qp(davs2_t * h, int luma_qp, int uv) { int qp = luma_qp + (uv == 0 ? h->chroma_quant_param_delta_u : h->chroma_quant_param_delta_v); #if HIGH_BIT_DEPTH const int bit_depth_offset = ((h->sample_bit_depth - 8) << 3); qp -= bit_depth_offset; qp = qp < 0 ? qp : QP_SCALE_CR[qp]; qp = DAVS2_CLIP3(0, 63 + bit_depth_offset, qp + bit_depth_offset); #else qp = QP_SCALE_CR[DAVS2_CLIP3(0, 63, qp)]; #endif return qp; } /* --------------------------------------------------------------------------- * get quant parameters */ static ALWAYS_INLINE void cu_get_quant_params(davs2_t * h, int qp, int bit_size, int *shift, int *scale) { *shift = IQ_SHIFT[qp] + (h->sample_bit_depth + 1) + bit_size - LIMIT_BIT; *scale = IQ_TAB[qp]; } #ifdef __cplusplus } #endif #endif // DAVS2_QUANT_H davs2-1.6/source/common/sao.cc000066400000000000000000000637331337322544400163030ustar00rootroot00000000000000/* * sao.cc * * Description of this file: * SAO functions definition of the davs2 library * * -------------------------------------------------------------------------- * * davs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. 
*/ #include "common.h" #include "sao.h" #include "aec.h" #include "frame.h" #include "vec/intrinsic.h" #if defined(_MSC_VER) || defined(__ICL) #pragma warning(disable: 4204) // nonstandard extension used: non-constant aggregate initializer #endif /** * =========================================================================== * local & global variables (const tables) * =========================================================================== */ const int saoclip[NUM_SAO_OFFSET][3] = { //EO { -1, 6, 7 }, // low bound, upper bound, threshold { 0, 1, 1 }, { 0, 0, 0 }, { -1, 0, 1 }, { -6, 1, 7 }, { -7, 7, 7 } // BO }; /* --------------------------------------------------------------------------- * lcu neighbor */ enum lcu_neighbor_e { SAO_T = 0, /* top */ SAO_D = 1, /* down */ SAO_L = 2, /* left */ SAO_R = 3, /* right */ SAO_TL = 4, /* top-left */ SAO_TR = 5, /* top-right */ SAO_DL = 6, /* down-left */ SAO_DR = 7 /* down-right */ }; typedef struct sao_region_t { int pix_x[IMG_COMPONENTS]; /* start pixel position in x */ int pix_y[IMG_COMPONENTS]; /* start pixel position in y */ int width[IMG_COMPONENTS]; /* */ int height[IMG_COMPONENTS]; /* */ /* availabilities of neighboring blocks */ int8_t b_left; int8_t b_top_left; int8_t b_top; int8_t b_top_right; int8_t b_right; int8_t b_right_down; int8_t b_down; int8_t b_down_left; } sao_region_t; /** * =========================================================================== * function defines * =========================================================================== */ /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE void sao_init_param(sao_t *lcu_sao) { int i; for (i = 0; i < IMG_COMPONENTS; i++) { lcu_sao->planes[i].modeIdc = SAO_MODE_OFF; lcu_sao->planes[i].typeIdc = -1; lcu_sao->planes[i].startBand = -1; lcu_sao->planes[i].startBand2 = -1; memset(lcu_sao->planes[i].offset, 0, MAX_NUM_SAO_CLASSES * sizeof(int)); } } /* --------------------------------------------------------------------------- */ static ALWAYS_INLINE void sao_copy_param(sao_t *dst, sao_t *src) { memcpy(dst, src, sizeof(sao_t)); } /* --------------------------------------------------------------------------- */ static void sao_block_eo_0_c(pel_t *p_dst, int i_dst, const pel_t *p_src, int i_src, int i_block_w, int i_block_h, int bit_depth, const int *lcu_avail, const int *sao_offset) { const int max_pel_val = (1 << bit_depth) - 1; int left_sign, right_sign; int edge_type; int x, y; int pel_diff; int sx = lcu_avail[SAO_L] ? 0 : 1; int ex = lcu_avail[SAO_R] ? i_block_w : (i_block_w - 1); for (y = 0; y < i_block_h; y++) { pel_diff = p_src[sx] - p_src[sx - 1]; left_sign = pel_diff > 0? 1 : (pel_diff < 0? -1 : 0); for (x = sx; x < ex; x++) { pel_diff = p_src[x] - p_src[x + 1]; right_sign = pel_diff > 0? 1 : (pel_diff < 0? -1 : 0); edge_type = left_sign + right_sign + 2; left_sign = -right_sign; p_dst[x] = (pel_t)DAVS2_CLIP3(0, max_pel_val, p_src[x] + sao_offset[edge_type]); } p_src += i_src; p_dst += i_dst; } } /* --------------------------------------------------------------------------- */ static void sao_block_eo_90_c(pel_t *p_dst, int i_dst, const pel_t *p_src, int i_src, int i_block_w, int i_block_h, int bit_depth, const int *lcu_avail, const int *sao_offset) { const int max_pel_val = (1 << bit_depth) - 1; int edge_type; int x, y; int sy = lcu_avail[SAO_T] ? 0 : 1; int ey = lcu_avail[SAO_D] ? 
i_block_h : (i_block_h - 1); for (x = 0; x < i_block_w; x++) { int pel_diff = p_src[sy * i_src + x] - p_src[(sy - 1) * i_src + x]; int top_sign = pel_diff > 0 ? 1 : (pel_diff < 0 ? -1 : 0); for (y = sy; y < ey; y++) { int pel_diff = p_src[y * i_src + x] - p_src[(y + 1) * i_src + x]; int down_sign = pel_diff > 0 ? 1 : (pel_diff < 0 ? -1 : 0); edge_type = down_sign + top_sign + 2; top_sign = -down_sign; p_dst[y * i_dst + x] = (pel_t)DAVS2_CLIP3(0, max_pel_val, p_src[y * i_src + x] + sao_offset[edge_type]); } } } /* --------------------------------------------------------------------------- */ static void sao_block_eo_135_c(pel_t *p_dst, int i_dst, const pel_t *p_src, int i_src, int i_block_w, int i_block_h, int bit_depth, const int *lcu_avail, const int *sao_offset) { int8_t SIGN_BUF[MAX_CU_SIZE + 32]; // sign of top line int8_t *UPROW_S = SIGN_BUF + 16; const int max_pel_val = (1 << bit_depth) - 1; int reg = 0; int sx, ex; // start/end (x, y) int sx_0, ex_0, sx_n, ex_n; // start/end x for first and last row int top_sign, down_sign; int edge_type; int pel_diff; int x, y; sx = lcu_avail[SAO_L] ? 0 : 1; ex = lcu_avail[SAO_R] ? i_block_w : (i_block_w - 1); // init the line buffer for (x = sx; x < ex; x++) { pel_diff = p_src[i_src + x + 1] - p_src[x]; top_sign = pel_diff > 0 ? 1 : (pel_diff < 0 ? -1 : 0); UPROW_S[x + 1] = (int8_t)top_sign; } // first row sx_0 = lcu_avail[SAO_TL] ? 0 : 1; ex_0 = lcu_avail[SAO_T] ? (lcu_avail[SAO_R] ? i_block_w : (i_block_w - 1)) : 1; for (x = sx_0; x < ex_0; x++) { pel_diff = p_src[x] - p_src[-i_src + x - 1]; top_sign = pel_diff > 0 ? 1 : (pel_diff < 0 ? -1 : 0); edge_type = top_sign - UPROW_S[x + 1] + 2; p_dst[x] = (pel_t)DAVS2_CLIP3(0, max_pel_val, p_src[x] + sao_offset[edge_type]); } // middle rows for (y = 1; y < i_block_h - 1; y++) { p_src += i_src; p_dst += i_dst; for (x = sx; x < ex; x++) { if (x == sx) { pel_diff = p_src[x] - p_src[-i_src + x - 1]; top_sign = pel_diff > 0 ? 1 : (pel_diff < 0 ? -1 : 0); UPROW_S[x] = (int8_t)top_sign; } pel_diff = p_src[x] - p_src[i_src + x + 1]; down_sign = pel_diff > 0 ? 1 : (pel_diff < 0 ? -1 : 0); edge_type = down_sign + UPROW_S[x] + 2; p_dst[x] = (pel_t)DAVS2_CLIP3(0, max_pel_val, p_src[x] + sao_offset[edge_type]); UPROW_S[x] = (int8_t)reg; reg = -down_sign; } } // last row sx_n = lcu_avail[SAO_D] ? (lcu_avail[SAO_L] ? 0 : 1) : (i_block_w - 1); ex_n = lcu_avail[SAO_DR] ? i_block_w : (i_block_w - 1); p_src += i_src; p_dst += i_dst; for (x = sx_n; x < ex_n; x++) { if (x == sx) { pel_diff = p_src[x] - p_src[-i_src + x - 1]; top_sign = pel_diff > 0 ? 1 : (pel_diff < 0 ? -1 : 0); UPROW_S[x] = (int8_t)top_sign; } pel_diff = p_src[x] - p_src[i_src + x + 1]; down_sign = pel_diff > 0 ? 1 : (pel_diff < 0 ? -1 : 0); edge_type = down_sign + UPROW_S[x] + 2; p_dst[x] = (pel_t)DAVS2_CLIP3(0, max_pel_val, p_src[x] + sao_offset[edge_type]); } } /* --------------------------------------------------------------------------- */ static void sao_block_eo_45_c(pel_t *p_dst, int i_dst, const pel_t *p_src, int i_src, int i_block_w, int i_block_h, int bit_depth, const int *lcu_avail, const int *sao_offset) { int8_t SIGN_BUF[MAX_CU_SIZE + 32]; // sign of top line int8_t *UPROW_S = SIGN_BUF + 16; const int max_pel_val = (1 << bit_depth) - 1; int sx, ex; // start/end (x, y) int sx_0, ex_0, sx_n, ex_n; // start/end x for first and last row int top_sign, down_sign; int edge_type; int pel_diff; int x, y; sx = lcu_avail[SAO_L] ? 0 : 1; ex = lcu_avail[SAO_R] ? 
i_block_w : (i_block_w - 1); // init the line buffer for (x = sx; x < ex; x++) { pel_diff = p_src[i_src + x - 1] - p_src[x]; top_sign = pel_diff > 0 ? 1 : (pel_diff < 0 ? -1 : 0); UPROW_S[x - 1] = (int8_t)top_sign; } // first row sx_0 = lcu_avail[SAO_T] ? (lcu_avail[SAO_L] ? 0 : 1) : (i_block_w - 1); ex_0 = lcu_avail[SAO_TR] ? i_block_w : (i_block_w - 1); for (x = sx_0; x < ex_0; x++) { pel_diff = p_src[x] - p_src[-i_src + x + 1]; top_sign = pel_diff > 0 ? 1 : (pel_diff < 0 ? -1 : 0); edge_type = top_sign - UPROW_S[x - 1] + 2; p_dst[x] = (pel_t)DAVS2_CLIP3(0, max_pel_val, p_src[x] + sao_offset[edge_type]); } // middle rows for (y = 1; y < i_block_h - 1; y++) { p_src += i_src; p_dst += i_dst; for (x = sx; x < ex; x++) { if (x == ex - 1) { pel_diff = p_src[x] - p_src[-i_src + x + 1]; top_sign = pel_diff > 0 ? 1 : (pel_diff < 0 ? -1 : 0); UPROW_S[x] = (int8_t)top_sign; } pel_diff = p_src[x] - p_src[i_src + x - 1]; down_sign = pel_diff > 0 ? 1 : (pel_diff < 0 ? -1 : 0); edge_type = down_sign + UPROW_S[x] + 2; p_dst[x] = (pel_t)DAVS2_CLIP3(0, max_pel_val, p_src[x] + sao_offset[edge_type]); UPROW_S[x - 1] = (int8_t)(-down_sign); } } // last row sx_n = lcu_avail[SAO_DL] ? 0 : 1; ex_n = lcu_avail[SAO_D] ? (lcu_avail[SAO_R] ? i_block_w : (i_block_w - 1)) : 1; p_src += i_src; p_dst += i_dst; for (x = sx_n; x < ex_n; x++) { if (x == ex - 1) { pel_diff = p_src[x] - p_src[-i_src + x + 1]; top_sign = pel_diff > 0 ? 1 : (pel_diff < 0 ? -1 : 0); UPROW_S[x] = (int8_t)top_sign; } pel_diff = p_src[x] - p_src[i_src + x - 1]; down_sign = pel_diff > 0 ? 1 : (pel_diff < 0 ? -1 : 0); edge_type = down_sign + UPROW_S[x] + 2; p_dst[x] = (pel_t)DAVS2_CLIP3(0, max_pel_val, p_src[x] + sao_offset[edge_type]); } } /* --------------------------------------------------------------------------- */ static void sao_block_bo_c(pel_t *p_dst, int i_dst, const pel_t *p_src, int i_src, int i_block_w, int i_block_h, int bit_depth, const sao_param_t *sao_param) { const int max_pel_val = (1 << bit_depth) - 1; const int *sao_offset = sao_param->offset; int edge_type; int x, y; const int band_shift = g_bit_depth - NUM_SAO_BO_CLASSES_IN_BIT; for (y = 0; y < i_block_h; y++) { for (x = 0; x < i_block_w; x++) { edge_type = p_src[x] >> band_shift; p_dst[x] = (pel_t)DAVS2_CLIP3(0, max_pel_val, p_src[x] + sao_offset[edge_type]); } p_src += i_src; p_dst += i_dst; } } /* --------------------------------------------------------------------------- */ static void sao_read_lcu(davs2_t *h, int lcu_xy, bool_t *slice_sao_on, sao_t *cur_sao_param) { const int w_in_scu = h->i_width_in_scu; const int scu_x = h->lcu.i_scu_x; const int scu_y = h->lcu.i_scu_y; const int scu_xy = h->lcu.i_scu_xy; int merge_mode = 0; int merge_top_avail, merge_left_avail; /* neighbor available? */ merge_top_avail = (scu_y == 0) ? 0 : (h->scu_data[scu_xy].i_slice_nr == h->scu_data[scu_xy - w_in_scu].i_slice_nr); merge_left_avail = (scu_x == 0) ? 
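/* [Editor's note] Worked example of the BO band mapping in sao_block_bo_c
 * above (assuming 8-bit samples and NUM_SAO_BO_CLASSES_IN_BIT == 5,
 * i.e. 32 bands): the band index is simply the top five bits of the sample,
 *
 *     band = pix >> (8 - 5);    // pix = 57  ->  band 7
 *
 * and only the four signalled bands (two pairs starting at startBand and
 * startBand2, parsed in sao_read_lcu in this file) carry non-zero offsets. */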
0 : (h->scu_data[scu_xy].i_slice_nr == h->scu_data[scu_xy - 1].i_slice_nr);

    if (merge_left_avail || merge_top_avail) {
        merge_mode = aec_read_sao_mergeflag(&h->aec, merge_left_avail, merge_top_avail);
    }

    if (merge_mode) {
        if (merge_mode == 2) {
            sao_copy_param(cur_sao_param, &h->lcu_infos[lcu_xy - 1].sao_param);                  // copy left
        } else {
            assert(merge_mode == 1);
            sao_copy_param(cur_sao_param, &h->lcu_infos[lcu_xy - h->i_width_in_lcu].sao_param);  // copy above
        }
    } else {
        int offset[4];
        int stBnd[2];
        int db_temp;
        int sao_mode, sao_type;
        int i;

        for (i = 0; i < IMG_COMPONENTS; i++) {
            if (!slice_sao_on[i]) {
                cur_sao_param->planes[i].modeIdc = SAO_MODE_OFF;
            } else {
                sao_mode = aec_read_sao_mode(&h->aec);
                switch (sao_mode) {
                case 0:
                    cur_sao_param->planes[i].modeIdc = SAO_MODE_OFF;
                    break;
                case 1:
                    cur_sao_param->planes[i].modeIdc = SAO_MODE_NEW;
                    cur_sao_param->planes[i].typeIdc = SAO_TYPE_BO;
                    break;
                case 3:
                    cur_sao_param->planes[i].modeIdc = SAO_MODE_NEW;
                    cur_sao_param->planes[i].typeIdc = SAO_TYPE_EO_0;
                    break;
                default:
                    assert(0);  // unreachable: sao_mode is one of {0, 1, 3}
                    break;
                }

                if (cur_sao_param->planes[i].modeIdc == SAO_MODE_NEW) {
                    aec_read_sao_offsets(&h->aec, &cur_sao_param->planes[i], offset);
                    sao_type = aec_read_sao_type(&h->aec, &cur_sao_param->planes[i]);

                    if (cur_sao_param->planes[i].typeIdc == SAO_TYPE_BO) {
                        memset(cur_sao_param->planes[i].offset, 0, MAX_NUM_SAO_CLASSES * sizeof(int));
                        db_temp  = sao_type >> NUM_SAO_BO_CLASSES_LOG2;
                        stBnd[0] = sao_type - (db_temp << NUM_SAO_BO_CLASSES_LOG2);
                        stBnd[1] = (stBnd[0] + db_temp) % 32;
                        cur_sao_param->planes[i].startBand  = stBnd[0];
                        cur_sao_param->planes[i].startBand2 = stBnd[1];
                        cur_sao_param->planes[i].offset[(stBnd[0]    )     ] = offset[0];
                        cur_sao_param->planes[i].offset[(stBnd[0] + 1) % 32] = offset[1];
                        cur_sao_param->planes[i].offset[(stBnd[1]    )     ] = offset[2];
                        cur_sao_param->planes[i].offset[(stBnd[1] + 1) % 32] = offset[3];
                        //memcpy(cur_sao_param->planes[i].offset, offset, 4 * sizeof(int));
                    } else {
                        assert(cur_sao_param->planes[i].typeIdc == SAO_TYPE_EO_0);
                        cur_sao_param->planes[i].typeIdc = sao_type;
                        cur_sao_param->planes[i].offset[SAO_CLASS_EO_FULL_VALLEY] = offset[0];
                        cur_sao_param->planes[i].offset[SAO_CLASS_EO_HALF_VALLEY] = offset[1];
                        cur_sao_param->planes[i].offset[SAO_CLASS_EO_PLAIN      ] = 0;
                        cur_sao_param->planes[i].offset[SAO_CLASS_EO_HALF_PEAK  ] = offset[2];
                        cur_sao_param->planes[i].offset[SAO_CLASS_EO_FULL_PEAK  ] = offset[3];
                    }
                }
            }
        }
    }
}

/* --------------------------------------------------------------------------- */
void sao_read_lcu_param(davs2_t *h, int lcu_xy, bool_t *slice_sao_on, sao_t *sao_param)
{
    if (slice_sao_on[0] || slice_sao_on[1] || slice_sao_on[2]) {
        sao_read_lcu(h, lcu_xy, slice_sao_on, sao_param);
    } else {
        sao_init_param(sao_param);
    }
}

/* --------------------------------------------------------------------------- */
static void sao_get_neighbor_avail(davs2_t *h, sao_region_t *p_avail, int i_lcu_x, int i_lcu_y)
{
    int i_lcu_level = h->i_lcu_level;
    int pix_x  = i_lcu_x << i_lcu_level;
    int pix_y  = i_lcu_y << i_lcu_level;
    int width  = DAVS2_MIN(1 << i_lcu_level, h->i_width - pix_x);
    int height = DAVS2_MIN(1 << i_lcu_level, h->i_height - pix_y);
    int pix_x_c = pix_x >> 1;
    int chroma_v_shift = (h->i_chroma_format == CHROMA_420);
    int pix_y_c  = pix_y >> chroma_v_shift;
    int width_c  = width >> 1;
    int height_c = height >> 1;

    /* availabilities of the neighboring LCUs */
    p_avail->b_left  = i_lcu_x != 0;
    p_avail->b_top   = i_lcu_y != 0;
    p_avail->b_right = (i_lcu_x < h->i_width_in_lcu - 1);
    p_avail->b_down  = (i_lcu_y < h->i_height_in_lcu - 1);

    if (h->seq_info.cross_loop_filter_flag == FALSE) {
        int scu_x = i_lcu_x <<
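/* [Editor's note] Sketch of the SAO merge convention parsed in
 * sao_read_lcu above (indices from this file): a merge flag of 2 copies
 * the SAO parameters of the left LCU, and 1 copies those of the above LCU,
 *
 *     sao_copy_param(cur, &h->lcu_infos[lcu_xy - 1].sao_param);                  // merge left
 *     sao_copy_param(cur, &h->lcu_infos[lcu_xy - h->i_width_in_lcu].sao_param);  // merge above
 *
 * and a neighbor is only offered for merging when it lies in the same slice. */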
(h->i_lcu_level - MIN_CU_SIZE_IN_BIT);
        int scu_y = i_lcu_y << (h->i_lcu_level - MIN_CU_SIZE_IN_BIT);

        if (p_avail->b_top) {
            p_avail->b_top = h->scu_data[scu_y * h->i_width_in_scu + scu_x].i_slice_nr
                          == h->scu_data[(scu_y - 1) * h->i_width_in_scu + scu_x].i_slice_nr;
        }
        if (p_avail->b_down) {
            scu_y += 1 << (h->i_lcu_level - MIN_CU_SIZE_IN_BIT);
            p_avail->b_down = h->scu_data[scu_y * h->i_width_in_scu + scu_x].i_slice_nr
                           == h->scu_data[(scu_y - 1) * h->i_width_in_scu + scu_x].i_slice_nr;
        }
    }

    p_avail->b_top_left   = p_avail->b_top  && p_avail->b_left;
    p_avail->b_top_right  = p_avail->b_top  && p_avail->b_right;
    p_avail->b_down_left  = p_avail->b_down && p_avail->b_left;
    p_avail->b_right_down = p_avail->b_down && p_avail->b_right;

    /* adjust the filtering region (shifted up-left by SAO_SHIFT_PIX_NUM samples) */
    if (!p_avail->b_right) {
        width   += SAO_SHIFT_PIX_NUM;
        width_c += SAO_SHIFT_PIX_NUM;
    }
    if (!p_avail->b_down) {
        height   += SAO_SHIFT_PIX_NUM;
        height_c += SAO_SHIFT_PIX_NUM;
    }
    if (p_avail->b_left) {
        pix_x   -= SAO_SHIFT_PIX_NUM;
        pix_x_c -= SAO_SHIFT_PIX_NUM;
    } else {
        width   -= SAO_SHIFT_PIX_NUM;
        width_c -= SAO_SHIFT_PIX_NUM;
    }
    if (p_avail->b_top) {
        pix_y   -= SAO_SHIFT_PIX_NUM;
        pix_y_c -= SAO_SHIFT_PIX_NUM;
    } else {
        height   -= SAO_SHIFT_PIX_NUM;
        height_c -= SAO_SHIFT_PIX_NUM;
    }

    /* make sure the width and height are not outside the picture */
    width    = DAVS2_MIN(width,    h->i_width - pix_x);
    width_c  = DAVS2_MIN(width_c,  (h->i_width >> 1) - pix_x_c);
    height   = DAVS2_MIN(height,   h->i_height - pix_y);
    height_c = DAVS2_MIN(height_c, (h->i_height >> 1) - pix_y_c);

    /* luma component */
    p_avail->pix_x[0]  = pix_x;
    p_avail->pix_y[0]  = pix_y;
    p_avail->width[0]  = width;
    p_avail->height[0] = height;

    /* chroma components */
    p_avail->pix_x[1]  = p_avail->pix_x[2]  = pix_x_c;
    p_avail->pix_y[1]  = p_avail->pix_y[2]  = pix_y_c;
    p_avail->width[1]  = p_avail->width[2]  = width_c;
    p_avail->height[1] = p_avail->height[2] = height_c;
}

/* --------------------------------------------------------------------------- */
void sao_lcu(davs2_t *h, davs2_frame_t *p_tmp_frm, davs2_frame_t *p_dec_frm, int i_lcu_x, int i_lcu_y)
{
    const int width_in_lcu = h->i_width_in_lcu;
    sao_t *lcu_param = &h->lcu_infos[i_lcu_y * width_in_lcu + i_lcu_x].sao_param;

    /* copy one decoded LCU */
    davs2_frame_copy_lcu(h, p_tmp_frm, p_dec_frm, i_lcu_x, i_lcu_y, 0, 0);

    /* SAO one LCU */
    sao_region_t region;
    int comp_idx;

    sao_get_neighbor_avail(h, &region, i_lcu_x, i_lcu_y);
    for (comp_idx = 0; comp_idx < IMG_COMPONENTS; comp_idx++) {
        if (h->slice_sao_on[comp_idx] == 0 || lcu_param->planes[comp_idx].modeIdc == SAO_MODE_OFF) {
            continue;
        }
        int filter_type = lcu_param->planes[comp_idx].typeIdc;
        assert(filter_type >= SAO_TYPE_EO_0 && filter_type <= SAO_TYPE_BO);
        int pix_y = region.pix_y[comp_idx];
        int pix_x = region.pix_x[comp_idx];
        const int bit_depth = h->sample_bit_depth;
        int blkoffset = pix_y * p_dec_frm->i_stride[comp_idx] + pix_x;
        pel_t *dst = p_dec_frm->planes[comp_idx] + blkoffset;
        pel_t *src = p_tmp_frm->planes[comp_idx] + blkoffset;

        if (filter_type == SAO_TYPE_BO) {
            sao_block_bo_c(dst, p_dec_frm->i_stride[comp_idx], src, p_dec_frm->i_stride[comp_idx],
                           region.width[comp_idx], region.height[comp_idx], bit_depth,
                           &lcu_param->planes[comp_idx]);
        } else {
            int avail[8];
            avail[0] = region.b_top;
            avail[1] = region.b_down;
            avail[2] = region.b_left;
            avail[3] = region.b_right;
            avail[4] = region.b_top_left;
            avail[5] = region.b_top_right;
            avail[6] = region.b_down_left;
            avail[7] = region.b_right_down;
            gf_davs2.sao_filter_eo[filter_type](dst, p_dec_frm->i_stride[comp_idx], src, p_dec_frm->i_stride[comp_idx],
                                                region.width[comp_idx], region.height[comp_idx],
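/* [Editor's note] Worked example of the region shift computed in
 * sao_get_neighbor_avail above (assuming SAO_SHIFT_PIX_NUM == 4 and
 * 64x64 LCUs): an interior LCU at (lcu_x, lcu_y) filters
 *
 *     pix_x = 64 * lcu_x - 4;   width  = 64;   // covers [64x - 4, 64x + 60)
 *     pix_y = 64 * lcu_y - 4;   height = 64;
 *
 * i.e. the region starts 4 samples inside the left/above neighbor and
 * stops 4 samples short of its own right/bottom edge, so neighboring
 * LCU regions tile the picture without overlap. */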
bit_depth, avail, lcu_param->planes[comp_idx].offset); } } } /* --------------------------------------------------------------------------- */ void sao_lcurow(davs2_t *h, davs2_frame_t *p_tmp_frm, davs2_frame_t *p_dec_frm, int i_lcu_y) { const int width_in_lcu = h->i_width_in_lcu; int lcu_xy = i_lcu_y * width_in_lcu; int lcu_x; /* copy one decoded LCU-row */ davs2_frame_copy_lcurow(h, p_tmp_frm, p_dec_frm, i_lcu_y, -4, 0); /* SAO one LCU-row */ for (lcu_x = 0; lcu_x < h->i_width_in_lcu; lcu_x++) { sao_region_t region; sao_t *lcu_param = &h->lcu_infos[lcu_xy++].sao_param; int comp_idx; sao_get_neighbor_avail(h, ®ion, lcu_x, i_lcu_y); for (comp_idx = 0; comp_idx < IMG_COMPONENTS; comp_idx++) { if (h->slice_sao_on[comp_idx] == 0 || lcu_param->planes[comp_idx].modeIdc == SAO_MODE_OFF){ continue; } int filter_type = lcu_param->planes[comp_idx].typeIdc; assert(filter_type >= SAO_TYPE_EO_0 && filter_type <= SAO_TYPE_BO); int pix_y = region.pix_y[comp_idx]; int pix_x = region.pix_x[comp_idx]; const int bit_depth = h->sample_bit_depth; int blkoffset = pix_y * p_dec_frm->i_stride[comp_idx] + pix_x; pel_t *dst = p_dec_frm->planes[comp_idx] + blkoffset; pel_t *src = p_tmp_frm->planes[comp_idx] + blkoffset; if (filter_type == SAO_TYPE_BO) { gf_davs2.sao_block_bo(dst, p_dec_frm->i_stride[comp_idx], src, p_dec_frm->i_stride[comp_idx], region.width[comp_idx], region.height[comp_idx], bit_depth, &lcu_param->planes[comp_idx]); } else { int avail[8]; avail[0] = region.b_top; avail[1] = region.b_down; avail[2] = region.b_left; avail[3] = region.b_right; avail[4] = region.b_top_left; avail[5] = region.b_top_right; avail[6] = region.b_down_left; avail[7] = region.b_right_down; gf_davs2.sao_filter_eo[filter_type](dst, p_dec_frm->i_stride[comp_idx], src, p_dec_frm->i_stride[comp_idx], region.width[comp_idx], region.height[comp_idx], bit_depth, avail, lcu_param->planes[comp_idx].offset); } } } } /* --------------------------------------------------------------------------- */ void davs2_sao_init(uint32_t cpuid, ao_funcs_t *fh) { /* init c function handles */ fh->sao_block_bo = sao_block_bo_c; fh->sao_filter_eo[SAO_TYPE_EO_0] = sao_block_eo_0_c; fh->sao_filter_eo[SAO_TYPE_EO_45] = sao_block_eo_45_c; fh->sao_filter_eo[SAO_TYPE_EO_90] = sao_block_eo_90_c; fh->sao_filter_eo[SAO_TYPE_EO_135] = sao_block_eo_135_c; /* init asm function handles */ #if HAVE_MMX if (cpuid & DAVS2_CPU_SSE4) { fh->sao_block_bo = SAO_on_block_bo_sse128; fh->sao_filter_eo[SAO_TYPE_EO_0] = SAO_on_block_eo_0_sse128; fh->sao_filter_eo[SAO_TYPE_EO_45] = SAO_on_block_eo_45_sse128; fh->sao_filter_eo[SAO_TYPE_EO_90] = SAO_on_block_eo_90_sse128; fh->sao_filter_eo[SAO_TYPE_EO_135] = SAO_on_block_eo_135_sse128; } if (cpuid & DAVS2_CPU_AVX2) { fh->sao_block_bo = SAO_on_block_bo_avx2; fh->sao_filter_eo[SAO_TYPE_EO_0] = SAO_on_block_eo_0_avx2; fh->sao_filter_eo[SAO_TYPE_EO_45] = SAO_on_block_eo_45_avx2; fh->sao_filter_eo[SAO_TYPE_EO_90] = SAO_on_block_eo_90_avx2; fh->sao_filter_eo[SAO_TYPE_EO_135] = SAO_on_block_eo_135_avx2; } #endif } davs2-1.6/source/common/sao.h000066400000000000000000000036441337322544400161400ustar00rootroot00000000000000/* * sao.h * * Description of this file: * SAO functions definition of the davs2 library * * -------------------------------------------------------------------------- * * davs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. 
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at sswang @ pku.edu.cn.
 */

#ifndef DAVS2_SAO_H
#define DAVS2_SAO_H

#ifdef __cplusplus
extern "C" {
#endif

#define sao_read_lcu_param FPFX(sao_read_lcu_param)
void sao_read_lcu_param(davs2_t *h, int lcu_xy, bool_t *slice_sao_on, sao_t *sao_param);
#define sao_lcu FPFX(sao_lcu)
void sao_lcu(davs2_t *h, davs2_frame_t *p_tmp_frm, davs2_frame_t *p_dec_frm, int i_lcu_x, int i_lcu_y);
#define sao_lcurow FPFX(sao_lcurow)
void sao_lcurow(davs2_t *h, davs2_frame_t *p_tmp_frm, davs2_frame_t *p_dec_frm, int i_lcu_y);
#define davs2_sao_init FPFX(sao_init)
void davs2_sao_init(uint32_t cpuid, ao_funcs_t *fh);

#ifdef __cplusplus
}
#endif

#endif  // DAVS2_SAO_H
davs2-1.6/source/common/scantab.h000066400000000000000000000746741337322544400167720ustar00rootroot00000000000000/*
 * scantab.h
 *
 * Description of this file:
 *    AVS2 scan tables of the davs2 library (this file is ONLY included by aec.c)
 *
 * --------------------------------------------------------------------------
 *
 * davs2 - video decoder of AVS2/IEEE1857.4 video coding standard
 * Copyright (C) 2018~ VCL, NELVT, Peking University
 *
 *    Authors: Falei LUO
 *             etc.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at sswang @ pku.edu.cn.
*/ #ifndef DAVS2_SCAN_TAB_H #define DAVS2_SCAN_TAB_H #ifdef __cplusplus extern "C" { #endif /** * =========================================================================== * global variables (const tables) * =========================================================================== */ /* --------------------------------------------------------------------------- */ static const int16_t tab_scan_2x2[4][2] = { { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } }; static const int16_t tab_scan_4x4[16][2] = { { 0, 0 }, { 1, 0 }, { 0, 1 }, { 0, 2 }, { 1, 1 }, { 2, 0 }, { 3, 0 }, { 2, 1 }, { 1, 2 }, { 0, 3 }, { 1, 3 }, { 2, 2 }, { 3, 1 }, { 3, 2 }, { 2, 3 }, { 3, 3 } }; static const int16_t tab_scan_8x8[64][2] = { { 0, 0 }, { 1, 0 }, { 0, 1 }, { 0, 2 }, { 1, 1 }, { 2, 0 }, { 3, 0 }, { 2, 1 }, { 1, 2 }, { 0, 3 }, { 0, 4 }, { 1, 3 }, { 2, 2 }, { 3, 1 }, { 4, 0 }, { 5, 0 }, { 4, 1 }, { 3, 2 }, { 2, 3 }, { 1, 4 }, { 0, 5 }, { 0, 6 }, { 1, 5 }, { 2, 4 }, { 3, 3 }, { 4, 2 }, { 5, 1 }, { 6, 0 }, { 7, 0 }, { 6, 1 }, { 5, 2 }, { 4, 3 }, { 3, 4 }, { 2, 5 }, { 1, 6 }, { 0, 7 }, { 1, 7 }, { 2, 6 }, { 3, 5 }, { 4, 4 }, { 5, 3 }, { 6, 2 }, { 7, 1 }, { 7, 2 }, { 6, 3 }, { 5, 4 }, { 4, 5 }, { 3, 6 }, { 2, 7 }, { 3, 7 }, { 4, 6 }, { 5, 5 }, { 6, 4 }, { 7, 3 }, { 7, 4 }, { 6, 5 }, { 5, 6 }, { 4, 7 }, { 5, 7 }, { 6, 6 }, { 7, 5 }, { 7, 6 }, { 6, 7 }, { 7, 7 } }; static const int16_t tab_scan_16x16[256][2] = { { 0, 0}, { 1, 0}, { 0, 1}, { 0, 2}, { 1, 1}, { 2, 0}, { 3, 0}, { 2, 1}, { 1, 2}, { 0, 3}, { 0, 4}, { 1, 3}, { 2, 2}, { 3, 1}, { 4, 0}, { 5, 0}, { 4, 1}, { 3, 2}, { 2, 3}, { 1, 4}, { 0, 5}, { 0, 6}, { 1, 5}, { 2, 4}, { 3, 3}, { 4, 2}, { 5, 1}, { 6, 0}, { 7, 0}, { 6, 1}, { 5, 2}, { 4, 3}, { 3, 4}, { 2, 5}, { 1, 6}, { 0, 7}, { 0, 8}, { 1, 7}, { 2, 6}, { 3, 5}, { 4, 4}, { 5, 3}, { 6, 2}, { 7, 1}, { 8, 0}, { 9, 0}, { 8, 1}, { 7, 2}, { 6, 3}, { 5, 4}, { 4, 5}, { 3, 6}, { 2, 7}, { 1, 8}, { 0, 9}, { 0, 10}, { 1, 9}, { 2, 8}, { 3, 7}, { 4, 6}, { 5, 5}, { 6, 4}, { 7, 3}, { 8, 2}, { 9, 1}, { 10, 0}, { 11, 0}, { 10, 1}, { 9, 2}, { 8, 3}, { 7, 4}, { 6, 5}, { 5, 6}, { 4, 7}, { 3, 8}, { 2, 9}, { 1, 10}, { 0, 11}, { 0, 12}, { 1, 11}, { 2, 10}, { 3, 9}, { 4, 8}, { 5, 7}, { 6, 6}, { 7, 5}, { 8, 4}, { 9, 3}, { 10, 2}, { 11, 1}, { 12, 0}, { 13, 0}, { 12, 1}, { 11, 2}, { 10, 3}, { 9, 4}, { 8, 5}, { 7, 6}, { 6, 7}, { 5, 8}, { 4, 9}, { 3, 10}, { 2, 11}, { 1, 12}, { 0, 13}, { 0, 14}, { 1, 13}, { 2, 12}, { 3, 11}, { 4, 10}, { 5, 9}, { 6, 8}, { 7, 7}, { 8, 6}, { 9, 5}, { 10, 4}, { 11, 3}, { 12, 2}, { 13, 1}, { 14, 0}, { 15, 0}, { 14, 1}, { 13, 2}, { 12, 3}, { 11, 4}, { 10, 5}, { 9, 6}, { 8, 7}, { 7, 8}, { 6, 9}, { 5, 10}, { 4, 11}, { 3, 12}, { 2, 13}, { 1, 14}, { 0, 15}, { 1, 15}, { 2, 14}, { 3, 13}, { 4, 12}, { 5, 11}, { 6, 10}, { 7, 9}, { 8, 8}, { 9, 7}, { 10, 6}, { 11, 5}, { 12, 4}, { 13, 3}, { 14, 2}, { 15, 1}, { 15, 2}, { 14, 3}, { 13, 4}, { 12, 5}, { 11, 6}, { 10, 7}, { 9, 8}, { 8, 9}, { 7, 10}, { 6, 11}, { 5, 12}, { 4, 13}, { 3, 14}, { 2, 15}, { 3, 15}, { 4, 14}, { 5, 13}, { 6, 12}, { 7, 11}, { 8, 10}, { 9, 9}, { 10, 8}, { 11, 7}, { 12, 6}, { 13, 5}, { 14, 4}, { 15, 3}, { 15, 4}, { 14, 5}, { 13, 6}, { 12, 7}, { 11, 8}, { 10, 9}, { 9, 10}, { 8, 11}, { 7, 12}, { 6, 13}, { 5, 14}, { 4, 15}, { 5, 15}, { 6, 14}, { 7, 13}, { 8, 12}, { 9, 11}, { 10, 10}, { 11, 9}, { 12, 8}, { 13, 7}, { 14, 6}, { 15, 5}, { 15, 6}, { 14, 7}, { 13, 8}, { 12, 9}, { 11, 10}, { 10, 11}, { 9, 12}, { 8, 13}, { 7, 14}, { 6, 15}, { 7, 15}, { 8, 14}, { 9, 13}, { 10, 12}, { 11, 11}, { 12, 10}, { 13, 9}, { 14, 8}, { 15, 7}, { 15, 8}, { 14, 9}, { 13, 10}, { 12, 11}, { 11, 
12}, { 10, 13}, { 9, 14}, { 8, 15}, { 9, 15}, { 10, 14}, { 11, 13}, { 12, 12}, { 13, 11}, { 14, 10}, { 15, 9}, { 15, 10}, { 14, 11}, { 13, 12}, { 12, 13}, { 11, 14}, { 10, 15}, { 11, 15}, { 12, 14}, { 13, 13}, { 14, 12}, { 15, 11}, { 15, 12}, { 14, 13}, { 13, 14}, { 12, 15}, { 13, 15}, { 14, 14}, { 15, 13}, { 15, 14}, { 14, 15}, { 15, 15} }; /* --------------------------------------------------------------------------- */ static const int16_t tab_scan_1x4[4][2] = { { 0, 0 }, { 1, 0 }, { 2, 0 }, { 3, 0 }, }; /* --------------------------------------------------------------------------- */ static const int16_t tab_scan_4x1[4][2] = { { 0, 0 }, { 0, 1 }, { 0, 2 }, { 0, 3 }, }; static const int16_t tab_scan_2x8[16][2] = { { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 }, { 2, 0 }, { 3, 0 }, { 2, 1 }, { 3, 1 }, { 4, 0 }, { 5, 0 }, { 4, 1 }, { 5, 1 }, { 6, 0 }, { 7, 0 }, { 6, 1 }, { 7, 1 } }; static const int16_t tab_scan_8x2[16][2] = { { 0, 0 }, { 1, 0 }, { 0, 1 }, { 0, 2 }, { 1, 1 }, { 1, 2 }, { 0, 3 }, { 0, 4 }, { 1, 3 }, { 1, 4 }, { 0, 5 }, { 0, 6 }, { 1, 5 }, { 1, 6 }, { 0, 7 }, { 1, 7 } }; static const int16_t tab_scan_4x16[64][2] = { { 0, 0}, { 1, 0}, { 0, 1}, { 0, 2}, { 1, 1}, { 2, 0}, { 3, 0}, { 2, 1}, { 1, 2}, { 0, 3}, { 1, 3}, { 2, 2}, { 3, 1}, { 3, 2}, { 2, 3}, { 3, 3}, { 4, 0}, { 5, 0}, { 4, 1}, { 4, 2}, { 5, 1}, { 6, 0}, { 7, 0}, { 6, 1}, { 5, 2}, { 4, 3}, { 5, 3}, { 6, 2}, { 7, 1}, { 7, 2}, { 6, 3}, { 7, 3}, { 8, 0}, { 9, 0}, { 8, 1}, { 8, 2}, { 9, 1}, { 10, 0}, { 11, 0}, { 10, 1}, { 9, 2}, { 8, 3}, { 9, 3}, { 10, 2}, { 11, 1}, { 11, 2}, { 10, 3}, { 11, 3}, { 12, 0}, { 13, 0}, { 12, 1}, { 12, 2}, { 13, 1}, { 14, 0}, { 15, 0}, { 14, 1}, { 13, 2}, { 12, 3}, { 13, 3}, { 14, 2}, { 15, 1}, { 15, 2}, { 14, 3}, { 15, 3} }; static const int16_t tab_scan_16x4[64][2] = { { 0, 0}, { 1, 0}, { 0, 1}, { 0, 2}, { 1, 1}, { 2, 0}, { 3, 0}, { 2, 1}, { 1, 2}, { 0, 3}, { 1, 3}, { 2, 2}, { 3, 1}, { 3, 2}, { 2, 3}, { 3, 3}, { 0, 4}, { 1, 4}, { 0, 5}, { 0, 6}, { 1, 5}, { 2, 4}, { 3, 4}, { 2, 5}, { 1, 6}, { 0, 7}, { 1, 7}, { 2, 6}, { 3, 5}, { 3, 6}, { 2, 7}, { 3, 7}, { 0, 8}, { 1, 8}, { 0, 9}, { 0, 10}, { 1, 9}, { 2, 8}, { 3, 8}, { 2, 9}, { 1, 10}, { 0, 11}, { 1, 11}, { 2, 10}, { 3, 9}, { 3, 10}, { 2, 11}, { 3, 11}, { 0, 12}, { 1, 12}, { 0, 13}, { 0, 14}, { 1, 13}, { 2, 12}, { 3, 12}, { 2, 13}, { 1, 14}, { 0, 15}, { 1, 15}, { 2, 14}, { 3, 13}, { 3, 14}, { 2, 15}, { 3, 15} }; static const int16_t tab_scan_8x32[256][2] = { { 0, 0}, { 1, 0}, { 0, 1}, { 0, 2}, { 1, 1}, { 2, 0}, { 3, 0}, { 2, 1}, { 1, 2}, { 0, 3}, { 1, 3}, { 2, 2}, { 3, 1}, { 3, 2}, { 2, 3}, { 3, 3}, { 4, 0}, { 5, 0}, { 4, 1}, { 4, 2}, { 5, 1}, { 6, 0}, { 7, 0}, { 6, 1}, { 5, 2}, { 4, 3}, { 5, 3}, { 6, 2}, { 7, 1}, { 7, 2}, { 6, 3}, { 7, 3}, { 0, 4}, { 1, 4}, { 0, 5}, { 0, 6}, { 1, 5}, { 2, 4}, { 3, 4}, { 2, 5}, { 1, 6}, { 0, 7}, { 1, 7}, { 2, 6}, { 3, 5}, { 3, 6}, { 2, 7}, { 3, 7}, { 4, 4}, { 5, 4}, { 4, 5}, { 4, 6}, { 5, 5}, { 6, 4}, { 7, 4}, { 6, 5}, { 5, 6}, { 4, 7}, { 5, 7}, { 6, 6}, { 7, 5}, { 7, 6}, { 6, 7}, { 7, 7}, { 8, 0}, { 9, 0}, { 8, 1}, { 8, 2}, { 9, 1}, { 10, 0}, { 11, 0}, { 10, 1}, { 9, 2}, { 8, 3}, { 9, 3}, { 10, 2}, { 11, 1}, { 11, 2}, { 10, 3}, { 11, 3}, { 12, 0}, { 13, 0}, { 12, 1}, { 12, 2}, { 13, 1}, { 14, 0}, { 15, 0}, { 14, 1}, { 13, 2}, { 12, 3}, { 13, 3}, { 14, 2}, { 15, 1}, { 15, 2}, { 14, 3}, { 15, 3}, { 8, 4}, { 9, 4}, { 8, 5}, { 8, 6}, { 9, 5}, { 10, 4}, { 11, 4}, { 10, 5}, { 9, 6}, { 8, 7}, { 9, 7}, { 10, 6}, { 11, 5}, { 11, 6}, { 10, 7}, { 11, 7}, { 12, 4}, { 13, 4}, { 12, 5}, { 12, 6}, { 13, 5}, { 14, 4}, { 15, 4}, { 14, 
5}, { 13, 6}, { 12, 7}, { 13, 7}, { 14, 6}, { 15, 5}, { 15, 6}, { 14, 7}, { 15, 7}, { 16, 0}, { 17, 0}, { 16, 1}, { 16, 2}, { 17, 1}, { 18, 0}, { 19, 0}, { 18, 1}, { 17, 2}, { 16, 3}, { 17, 3}, { 18, 2}, { 19, 1}, { 19, 2}, { 18, 3}, { 19, 3}, { 20, 0}, { 21, 0}, { 20, 1}, { 20, 2}, { 21, 1}, { 22, 0}, { 23, 0}, { 22, 1}, { 21, 2}, { 20, 3}, { 21, 3}, { 22, 2}, { 23, 1}, { 23, 2}, { 22, 3}, { 23, 3}, { 16, 4}, { 17, 4}, { 16, 5}, { 16, 6}, { 17, 5}, { 18, 4}, { 19, 4}, { 18, 5}, { 17, 6}, { 16, 7}, { 17, 7}, { 18, 6}, { 19, 5}, { 19, 6}, { 18, 7}, { 19, 7}, { 20, 4}, { 21, 4}, { 20, 5}, { 20, 6}, { 21, 5}, { 22, 4}, { 23, 4}, { 22, 5}, { 21, 6}, { 20, 7}, { 21, 7}, { 22, 6}, { 23, 5}, { 23, 6}, { 22, 7}, { 23, 7}, { 24, 0}, { 25, 0}, { 24, 1}, { 24, 2}, { 25, 1}, { 26, 0}, { 27, 0}, { 26, 1}, { 25, 2}, { 24, 3}, { 25, 3}, { 26, 2}, { 27, 1}, { 27, 2}, { 26, 3}, { 27, 3}, { 28, 0}, { 29, 0}, { 28, 1}, { 28, 2}, { 29, 1}, { 30, 0}, { 31, 0}, { 30, 1}, { 29, 2}, { 28, 3}, { 29, 3}, { 30, 2}, { 31, 1}, { 31, 2}, { 30, 3}, { 31, 3}, { 24, 4}, { 25, 4}, { 24, 5}, { 24, 6}, { 25, 5}, { 26, 4}, { 27, 4}, { 26, 5}, { 25, 6}, { 24, 7}, { 25, 7}, { 26, 6}, { 27, 5}, { 27, 6}, { 26, 7}, { 27, 7}, { 28, 4}, { 29, 4}, { 28, 5}, { 28, 6}, { 29, 5}, { 30, 4}, { 31, 4}, { 30, 5}, { 29, 6}, { 28, 7}, { 29, 7}, { 30, 6}, { 31, 5}, { 31, 6}, { 30, 7}, { 31, 7} }; static const int16_t tab_scan_32x8[256][2] = { { 0, 0}, { 1, 0}, { 0, 1}, { 0, 2}, { 1, 1}, { 2, 0}, { 3, 0}, { 2, 1}, { 1, 2}, { 0, 3}, { 1, 3}, { 2, 2}, { 3, 1}, { 3, 2}, { 2, 3}, { 3, 3}, { 4, 0}, { 5, 0}, { 4, 1}, { 4, 2}, { 5, 1}, { 6, 0}, { 7, 0}, { 6, 1}, { 5, 2}, { 4, 3}, { 5, 3}, { 6, 2}, { 7, 1}, { 7, 2}, { 6, 3}, { 7, 3}, { 0, 4}, { 1, 4}, { 0, 5}, { 0, 6}, { 1, 5}, { 2, 4}, { 3, 4}, { 2, 5}, { 1, 6}, { 0, 7}, { 1, 7}, { 2, 6}, { 3, 5}, { 3, 6}, { 2, 7}, { 3, 7}, { 0, 8}, { 1, 8}, { 0, 9}, { 0, 10}, { 1, 9}, { 2, 8}, { 3, 8}, { 2, 9}, { 1, 10}, { 0, 11}, { 1, 11}, { 2, 10}, { 3, 9}, { 3, 10}, { 2, 11}, { 3, 11}, { 4, 4}, { 5, 4}, { 4, 5}, { 4, 6}, { 5, 5}, { 6, 4}, { 7, 4}, { 6, 5}, { 5, 6}, { 4, 7}, { 5, 7}, { 6, 6}, { 7, 5}, { 7, 6}, { 6, 7}, { 7, 7}, { 4, 8}, { 5, 8}, { 4, 9}, { 4, 10}, { 5, 9}, { 6, 8}, { 7, 8}, { 6, 9}, { 5, 10}, { 4, 11}, { 5, 11}, { 6, 10}, { 7, 9}, { 7, 10}, { 6, 11}, { 7, 11}, { 0, 12}, { 1, 12}, { 0, 13}, { 0, 14}, { 1, 13}, { 2, 12}, { 3, 12}, { 2, 13}, { 1, 14}, { 0, 15}, { 1, 15}, { 2, 14}, { 3, 13}, { 3, 14}, { 2, 15}, { 3, 15}, { 0, 16}, { 1, 16}, { 0, 17}, { 0, 18}, { 1, 17}, { 2, 16}, { 3, 16}, { 2, 17}, { 1, 18}, { 0, 19}, { 1, 19}, { 2, 18}, { 3, 17}, { 3, 18}, { 2, 19}, { 3, 19}, { 4, 12}, { 5, 12}, { 4, 13}, { 4, 14}, { 5, 13}, { 6, 12}, { 7, 12}, { 6, 13}, { 5, 14}, { 4, 15}, { 5, 15}, { 6, 14}, { 7, 13}, { 7, 14}, { 6, 15}, { 7, 15}, { 4, 16}, { 5, 16}, { 4, 17}, { 4, 18}, { 5, 17}, { 6, 16}, { 7, 16}, { 6, 17}, { 5, 18}, { 4, 19}, { 5, 19}, { 6, 18}, { 7, 17}, { 7, 18}, { 6, 19}, { 7, 19}, { 0, 20}, { 1, 20}, { 0, 21}, { 0, 22}, { 1, 21}, { 2, 20}, { 3, 20}, { 2, 21}, { 1, 22}, { 0, 23}, { 1, 23}, { 2, 22}, { 3, 21}, { 3, 22}, { 2, 23}, { 3, 23}, { 0, 24}, { 1, 24}, { 0, 25}, { 0, 26}, { 1, 25}, { 2, 24}, { 3, 24}, { 2, 25}, { 1, 26}, { 0, 27}, { 1, 27}, { 2, 26}, { 3, 25}, { 3, 26}, { 2, 27}, { 3, 27}, { 4, 20}, { 5, 20}, { 4, 21}, { 4, 22}, { 5, 21}, { 6, 20}, { 7, 20}, { 6, 21}, { 5, 22}, { 4, 23}, { 5, 23}, { 6, 22}, { 7, 21}, { 7, 22}, { 6, 23}, { 7, 23}, { 4, 24}, { 5, 24}, { 4, 25}, { 4, 26}, { 5, 25}, { 6, 24}, { 7, 24}, { 6, 25}, { 5, 26}, { 4, 27}, { 5, 27}, { 6, 26}, { 7, 25}, { 7, 26}, { 
6, 27}, { 7, 27}, { 0, 28}, { 1, 28}, { 0, 29}, { 0, 30}, { 1, 29}, { 2, 28}, { 3, 28}, { 2, 29}, { 1, 30}, { 0, 31}, { 1, 31}, { 2, 30}, { 3, 29}, { 3, 30}, { 2, 31}, { 3, 31}, { 4, 28}, { 5, 28}, { 4, 29}, { 4, 30}, { 5, 29}, { 6, 28}, { 7, 28}, { 6, 29}, { 5, 30}, { 4, 31}, { 5, 31}, { 6, 30}, { 7, 29}, { 7, 30}, { 6, 31}, { 7, 31} }; static const int16_t tab_scan_cg_8x8[64][2] = { { 0, 0 }, { 1, 0 }, { 0, 1 }, { 0, 2 }, { 1, 1 }, { 2, 0 }, { 3, 0 }, { 2, 1 }, { 1, 2 }, { 0, 3 }, { 1, 3 }, { 2, 2 }, { 3, 1 }, { 3, 2 }, { 2, 3 }, { 3, 3 }, { 4, 0 }, { 5, 0 }, { 4, 1 }, { 4, 2 }, { 5, 1 }, { 6, 0 }, { 7, 0 }, { 6, 1 }, { 5, 2 }, { 4, 3 }, { 5, 3 }, { 6, 2 }, { 7, 1 }, { 7, 2 }, { 6, 3 }, { 7, 3 }, { 0, 4 }, { 1, 4 }, { 0, 5 }, { 0, 6 }, { 1, 5 }, { 2, 4 }, { 3, 4 }, { 2, 5 }, { 1, 6 }, { 0, 7 }, { 1, 7 }, { 2, 6 }, { 3, 5 }, { 3, 6 }, { 2, 7 }, { 3, 7 }, { 4, 4 }, { 5, 4 }, { 4, 5 }, { 4, 6 }, { 5, 5 }, { 6, 4 }, { 7, 4 }, { 6, 5 }, { 5, 6 }, { 4, 7 }, { 5, 7 }, { 6, 6 }, { 7, 5 }, { 7, 6 }, { 6, 7 }, { 7, 7 } }; static const int16_t tab_scan_cg_16x16[256][2] = { { 0, 0}, { 1, 0}, { 0, 1}, { 0, 2}, { 1, 1}, { 2, 0}, { 3, 0}, { 2, 1}, { 1, 2}, { 0, 3}, { 1, 3}, { 2, 2}, { 3, 1}, { 3, 2}, { 2, 3}, { 3, 3}, { 4, 0}, { 5, 0}, { 4, 1}, { 4, 2}, { 5, 1}, { 6, 0}, { 7, 0}, { 6, 1}, { 5, 2}, { 4, 3}, { 5, 3}, { 6, 2}, { 7, 1}, { 7, 2}, { 6, 3}, { 7, 3}, { 0, 4}, { 1, 4}, { 0, 5}, { 0, 6}, { 1, 5}, { 2, 4}, { 3, 4}, { 2, 5}, { 1, 6}, { 0, 7}, { 1, 7}, { 2, 6}, { 3, 5}, { 3, 6}, { 2, 7}, { 3, 7}, { 0, 8}, { 1, 8}, { 0, 9}, { 0, 10}, { 1, 9}, { 2, 8}, { 3, 8}, { 2, 9}, { 1, 10}, { 0, 11}, { 1, 11}, { 2, 10}, { 3, 9}, { 3, 10}, { 2, 11}, { 3, 11}, { 4, 4}, { 5, 4}, { 4, 5}, { 4, 6}, { 5, 5}, { 6, 4}, { 7, 4}, { 6, 5}, { 5, 6}, { 4, 7}, { 5, 7}, { 6, 6}, { 7, 5}, { 7, 6}, { 6, 7}, { 7, 7}, { 8, 0}, { 9, 0}, { 8, 1}, { 8, 2}, { 9, 1}, { 10, 0}, { 11, 0}, { 10, 1}, { 9, 2}, { 8, 3}, { 9, 3}, { 10, 2}, { 11, 1}, { 11, 2}, { 10, 3}, { 11, 3}, { 12, 0}, { 13, 0}, { 12, 1}, { 12, 2}, { 13, 1}, { 14, 0}, { 15, 0}, { 14, 1}, { 13, 2}, { 12, 3}, { 13, 3}, { 14, 2}, { 15, 1}, { 15, 2}, { 14, 3}, { 15, 3}, { 8, 4}, { 9, 4}, { 8, 5}, { 8, 6}, { 9, 5}, { 10, 4}, { 11, 4}, { 10, 5}, { 9, 6}, { 8, 7}, { 9, 7}, { 10, 6}, { 11, 5}, { 11, 6}, { 10, 7}, { 11, 7}, { 4, 8}, { 5, 8}, { 4, 9}, { 4, 10}, { 5, 9}, { 6, 8}, { 7, 8}, { 6, 9}, { 5, 10}, { 4, 11}, { 5, 11}, { 6, 10}, { 7, 9}, { 7, 10}, { 6, 11}, { 7, 11}, { 0, 12}, { 1, 12}, { 0, 13}, { 0, 14}, { 1, 13}, { 2, 12}, { 3, 12}, { 2, 13}, { 1, 14}, { 0, 15}, { 1, 15}, { 2, 14}, { 3, 13}, { 3, 14}, { 2, 15}, { 3, 15}, { 4, 12}, { 5, 12}, { 4, 13}, { 4, 14}, { 5, 13}, { 6, 12}, { 7, 12}, { 6, 13}, { 5, 14}, { 4, 15}, { 5, 15}, { 6, 14}, { 7, 13}, { 7, 14}, { 6, 15}, { 7, 15}, { 8, 8}, { 9, 8}, { 8, 9}, { 8, 10}, { 9, 9}, { 10, 8}, { 11, 8}, { 10, 9}, { 9, 10}, { 8, 11}, { 9, 11}, { 10, 10}, { 11, 9}, { 11, 10}, { 10, 11}, { 11, 11}, { 12, 4}, { 13, 4}, { 12, 5}, { 12, 6}, { 13, 5}, { 14, 4}, { 15, 4}, { 14, 5}, { 13, 6}, { 12, 7}, { 13, 7}, { 14, 6}, { 15, 5}, { 15, 6}, { 14, 7}, { 15, 7}, { 12, 8}, { 13, 8}, { 12, 9}, { 12, 10}, { 13, 9}, { 14, 8}, { 15, 8}, { 14, 9}, { 13, 10}, { 12, 11}, { 13, 11}, { 14, 10}, { 15, 9}, { 15, 10}, { 14, 11}, { 15, 11}, { 8, 12}, { 9, 12}, { 8, 13}, { 8, 14}, { 9, 13}, { 10, 12}, { 11, 12}, { 10, 13}, { 9, 14}, { 8, 15}, { 9, 15}, { 10, 14}, { 11, 13}, { 11, 14}, { 10, 15}, { 11, 15}, { 12, 12}, { 13, 12}, { 12, 13}, { 12, 14}, { 13, 13}, { 14, 12}, { 15, 12}, { 14, 13}, { 13, 14}, { 12, 15}, { 13, 15}, { 14, 14}, { 15, 13}, { 15, 14}, 
{ 14, 15}, { 15, 15} }; static const int16_t tab_scan_cg_32x32[1024][2] = { { 0, 0}, { 1, 0}, { 0, 1}, { 0, 2}, { 1, 1}, { 2, 0}, { 3, 0}, { 2, 1}, { 1, 2}, { 0, 3}, { 1, 3}, { 2, 2}, { 3, 1}, { 3, 2}, { 2, 3}, { 3, 3}, { 4, 0}, { 5, 0}, { 4, 1}, { 4, 2}, { 5, 1}, { 6, 0}, { 7, 0}, { 6, 1}, { 5, 2}, { 4, 3}, { 5, 3}, { 6, 2}, { 7, 1}, { 7, 2}, { 6, 3}, { 7, 3}, { 0, 4}, { 1, 4}, { 0, 5}, { 0, 6}, { 1, 5}, { 2, 4}, { 3, 4}, { 2, 5}, { 1, 6}, { 0, 7}, { 1, 7}, { 2, 6}, { 3, 5}, { 3, 6}, { 2, 7}, { 3, 7}, { 0, 8}, { 1, 8}, { 0, 9}, { 0, 10}, { 1, 9}, { 2, 8}, { 3, 8}, { 2, 9}, { 1, 10}, { 0, 11}, { 1, 11}, { 2, 10}, { 3, 9}, { 3, 10}, { 2, 11}, { 3, 11}, { 4, 4}, { 5, 4}, { 4, 5}, { 4, 6}, { 5, 5}, { 6, 4}, { 7, 4}, { 6, 5}, { 5, 6}, { 4, 7}, { 5, 7}, { 6, 6}, { 7, 5}, { 7, 6}, { 6, 7}, { 7, 7}, { 8, 0}, { 9, 0}, { 8, 1}, { 8, 2}, { 9, 1}, { 10, 0}, { 11, 0}, { 10, 1}, { 9, 2}, { 8, 3}, { 9, 3}, { 10, 2}, { 11, 1}, { 11, 2}, { 10, 3}, { 11, 3}, { 12, 0}, { 13, 0}, { 12, 1}, { 12, 2}, { 13, 1}, { 14, 0}, { 15, 0}, { 14, 1}, { 13, 2}, { 12, 3}, { 13, 3}, { 14, 2}, { 15, 1}, { 15, 2}, { 14, 3}, { 15, 3}, { 8, 4}, { 9, 4}, { 8, 5}, { 8, 6}, { 9, 5}, { 10, 4}, { 11, 4}, { 10, 5}, { 9, 6}, { 8, 7}, { 9, 7}, { 10, 6}, { 11, 5}, { 11, 6}, { 10, 7}, { 11, 7}, { 4, 8}, { 5, 8}, { 4, 9}, { 4, 10}, { 5, 9}, { 6, 8}, { 7, 8}, { 6, 9}, { 5, 10}, { 4, 11}, { 5, 11}, { 6, 10}, { 7, 9}, { 7, 10}, { 6, 11}, { 7, 11}, { 0, 12}, { 1, 12}, { 0, 13}, { 0, 14}, { 1, 13}, { 2, 12}, { 3, 12}, { 2, 13}, { 1, 14}, { 0, 15}, { 1, 15}, { 2, 14}, { 3, 13}, { 3, 14}, { 2, 15}, { 3, 15}, { 0, 16}, { 1, 16}, { 0, 17}, { 0, 18}, { 1, 17}, { 2, 16}, { 3, 16}, { 2, 17}, { 1, 18}, { 0, 19}, { 1, 19}, { 2, 18}, { 3, 17}, { 3, 18}, { 2, 19}, { 3, 19}, { 4, 12}, { 5, 12}, { 4, 13}, { 4, 14}, { 5, 13}, { 6, 12}, { 7, 12}, { 6, 13}, { 5, 14}, { 4, 15}, { 5, 15}, { 6, 14}, { 7, 13}, { 7, 14}, { 6, 15}, { 7, 15}, { 8, 8}, { 9, 8}, { 8, 9}, { 8, 10}, { 9, 9}, { 10, 8}, { 11, 8}, { 10, 9}, { 9, 10}, { 8, 11}, { 9, 11}, { 10, 10}, { 11, 9}, { 11, 10}, { 10, 11}, { 11, 11}, { 12, 4}, { 13, 4}, { 12, 5}, { 12, 6}, { 13, 5}, { 14, 4}, { 15, 4}, { 14, 5}, { 13, 6}, { 12, 7}, { 13, 7}, { 14, 6}, { 15, 5}, { 15, 6}, { 14, 7}, { 15, 7}, { 16, 0}, { 17, 0}, { 16, 1}, { 16, 2}, { 17, 1}, { 18, 0}, { 19, 0}, { 18, 1}, { 17, 2}, { 16, 3}, { 17, 3}, { 18, 2}, { 19, 1}, { 19, 2}, { 18, 3}, { 19, 3}, { 20, 0}, { 21, 0}, { 20, 1}, { 20, 2}, { 21, 1}, { 22, 0}, { 23, 0}, { 22, 1}, { 21, 2}, { 20, 3}, { 21, 3}, { 22, 2}, { 23, 1}, { 23, 2}, { 22, 3}, { 23, 3}, { 16, 4}, { 17, 4}, { 16, 5}, { 16, 6}, { 17, 5}, { 18, 4}, { 19, 4}, { 18, 5}, { 17, 6}, { 16, 7}, { 17, 7}, { 18, 6}, { 19, 5}, { 19, 6}, { 18, 7}, { 19, 7}, { 12, 8}, { 13, 8}, { 12, 9}, { 12, 10}, { 13, 9}, { 14, 8}, { 15, 8}, { 14, 9}, { 13, 10}, { 12, 11}, { 13, 11}, { 14, 10}, { 15, 9}, { 15, 10}, { 14, 11}, { 15, 11}, { 8, 12}, { 9, 12}, { 8, 13}, { 8, 14}, { 9, 13}, { 10, 12}, { 11, 12}, { 10, 13}, { 9, 14}, { 8, 15}, { 9, 15}, { 10, 14}, { 11, 13}, { 11, 14}, { 10, 15}, { 11, 15}, { 4, 16}, { 5, 16}, { 4, 17}, { 4, 18}, { 5, 17}, { 6, 16}, { 7, 16}, { 6, 17}, { 5, 18}, { 4, 19}, { 5, 19}, { 6, 18}, { 7, 17}, { 7, 18}, { 6, 19}, { 7, 19}, { 0, 20}, { 1, 20}, { 0, 21}, { 0, 22}, { 1, 21}, { 2, 20}, { 3, 20}, { 2, 21}, { 1, 22}, { 0, 23}, { 1, 23}, { 2, 22}, { 3, 21}, { 3, 22}, { 2, 23}, { 3, 23}, { 0, 24}, { 1, 24}, { 0, 25}, { 0, 26}, { 1, 25}, { 2, 24}, { 3, 24}, { 2, 25}, { 1, 26}, { 0, 27}, { 1, 27}, { 2, 26}, { 3, 25}, { 3, 26}, { 2, 27}, { 3, 27}, { 4, 20}, { 5, 20}, { 4, 21}, { 4, 
22}, { 5, 21}, { 6, 20}, { 7, 20}, { 6, 21}, { 5, 22}, { 4, 23}, { 5, 23}, { 6, 22}, { 7, 21}, { 7, 22}, { 6, 23}, { 7, 23}, { 8, 16}, { 9, 16}, { 8, 17}, { 8, 18}, { 9, 17}, { 10, 16}, { 11, 16}, { 10, 17}, { 9, 18}, { 8, 19}, { 9, 19}, { 10, 18}, { 11, 17}, { 11, 18}, { 10, 19}, { 11, 19}, { 12, 12}, { 13, 12}, { 12, 13}, { 12, 14}, { 13, 13}, { 14, 12}, { 15, 12}, { 14, 13}, { 13, 14}, { 12, 15}, { 13, 15}, { 14, 14}, { 15, 13}, { 15, 14}, { 14, 15}, { 15, 15}, { 16, 8}, { 17, 8}, { 16, 9}, { 16, 10}, { 17, 9}, { 18, 8}, { 19, 8}, { 18, 9}, { 17, 10}, { 16, 11}, { 17, 11}, { 18, 10}, { 19, 9}, { 19, 10}, { 18, 11}, { 19, 11}, { 20, 4}, { 21, 4}, { 20, 5}, { 20, 6}, { 21, 5}, { 22, 4}, { 23, 4}, { 22, 5}, { 21, 6}, { 20, 7}, { 21, 7}, { 22, 6}, { 23, 5}, { 23, 6}, { 22, 7}, { 23, 7}, { 24, 0}, { 25, 0}, { 24, 1}, { 24, 2}, { 25, 1}, { 26, 0}, { 27, 0}, { 26, 1}, { 25, 2}, { 24, 3}, { 25, 3}, { 26, 2}, { 27, 1}, { 27, 2}, { 26, 3}, { 27, 3}, { 28, 0}, { 29, 0}, { 28, 1}, { 28, 2}, { 29, 1}, { 30, 0}, { 31, 0}, { 30, 1}, { 29, 2}, { 28, 3}, { 29, 3}, { 30, 2}, { 31, 1}, { 31, 2}, { 30, 3}, { 31, 3}, { 24, 4}, { 25, 4}, { 24, 5}, { 24, 6}, { 25, 5}, { 26, 4}, { 27, 4}, { 26, 5}, { 25, 6}, { 24, 7}, { 25, 7}, { 26, 6}, { 27, 5}, { 27, 6}, { 26, 7}, { 27, 7}, { 20, 8}, { 21, 8}, { 20, 9}, { 20, 10}, { 21, 9}, { 22, 8}, { 23, 8}, { 22, 9}, { 21, 10}, { 20, 11}, { 21, 11}, { 22, 10}, { 23, 9}, { 23, 10}, { 22, 11}, { 23, 11}, { 16, 12}, { 17, 12}, { 16, 13}, { 16, 14}, { 17, 13}, { 18, 12}, { 19, 12}, { 18, 13}, { 17, 14}, { 16, 15}, { 17, 15}, { 18, 14}, { 19, 13}, { 19, 14}, { 18, 15}, { 19, 15}, { 12, 16}, { 13, 16}, { 12, 17}, { 12, 18}, { 13, 17}, { 14, 16}, { 15, 16}, { 14, 17}, { 13, 18}, { 12, 19}, { 13, 19}, { 14, 18}, { 15, 17}, { 15, 18}, { 14, 19}, { 15, 19}, { 8, 20}, { 9, 20}, { 8, 21}, { 8, 22}, { 9, 21}, { 10, 20}, { 11, 20}, { 10, 21}, { 9, 22}, { 8, 23}, { 9, 23}, { 10, 22}, { 11, 21}, { 11, 22}, { 10, 23}, { 11, 23}, { 4, 24}, { 5, 24}, { 4, 25}, { 4, 26}, { 5, 25}, { 6, 24}, { 7, 24}, { 6, 25}, { 5, 26}, { 4, 27}, { 5, 27}, { 6, 26}, { 7, 25}, { 7, 26}, { 6, 27}, { 7, 27}, { 0, 28}, { 1, 28}, { 0, 29}, { 0, 30}, { 1, 29}, { 2, 28}, { 3, 28}, { 2, 29}, { 1, 30}, { 0, 31}, { 1, 31}, { 2, 30}, { 3, 29}, { 3, 30}, { 2, 31}, { 3, 31}, { 4, 28}, { 5, 28}, { 4, 29}, { 4, 30}, { 5, 29}, { 6, 28}, { 7, 28}, { 6, 29}, { 5, 30}, { 4, 31}, { 5, 31}, { 6, 30}, { 7, 29}, { 7, 30}, { 6, 31}, { 7, 31}, { 8, 24}, { 9, 24}, { 8, 25}, { 8, 26}, { 9, 25}, { 10, 24}, { 11, 24}, { 10, 25}, { 9, 26}, { 8, 27}, { 9, 27}, { 10, 26}, { 11, 25}, { 11, 26}, { 10, 27}, { 11, 27}, { 12, 20}, { 13, 20}, { 12, 21}, { 12, 22}, { 13, 21}, { 14, 20}, { 15, 20}, { 14, 21}, { 13, 22}, { 12, 23}, { 13, 23}, { 14, 22}, { 15, 21}, { 15, 22}, { 14, 23}, { 15, 23}, { 16, 16}, { 17, 16}, { 16, 17}, { 16, 18}, { 17, 17}, { 18, 16}, { 19, 16}, { 18, 17}, { 17, 18}, { 16, 19}, { 17, 19}, { 18, 18}, { 19, 17}, { 19, 18}, { 18, 19}, { 19, 19}, { 20, 12}, { 21, 12}, { 20, 13}, { 20, 14}, { 21, 13}, { 22, 12}, { 23, 12}, { 22, 13}, { 21, 14}, { 20, 15}, { 21, 15}, { 22, 14}, { 23, 13}, { 23, 14}, { 22, 15}, { 23, 15}, { 24, 8}, { 25, 8}, { 24, 9}, { 24, 10}, { 25, 9}, { 26, 8}, { 27, 8}, { 26, 9}, { 25, 10}, { 24, 11}, { 25, 11}, { 26, 10}, { 27, 9}, { 27, 10}, { 26, 11}, { 27, 11}, { 28, 4}, { 29, 4}, { 28, 5}, { 28, 6}, { 29, 5}, { 30, 4}, { 31, 4}, { 30, 5}, { 29, 6}, { 28, 7}, { 29, 7}, { 30, 6}, { 31, 5}, { 31, 6}, { 30, 7}, { 31, 7}, { 28, 8}, { 29, 8}, { 28, 9}, { 28, 10}, { 29, 9}, { 30, 8}, { 31, 8}, { 30, 9}, { 
29, 10}, { 28, 11}, { 29, 11}, { 30, 10}, { 31, 9}, { 31, 10}, { 30, 11}, { 31, 11}, { 24, 12}, { 25, 12}, { 24, 13}, { 24, 14}, { 25, 13}, { 26, 12}, { 27, 12}, { 26, 13}, { 25, 14}, { 24, 15}, { 25, 15}, { 26, 14}, { 27, 13}, { 27, 14}, { 26, 15}, { 27, 15}, { 20, 16}, { 21, 16}, { 20, 17}, { 20, 18}, { 21, 17}, { 22, 16}, { 23, 16}, { 22, 17}, { 21, 18}, { 20, 19}, { 21, 19}, { 22, 18}, { 23, 17}, { 23, 18}, { 22, 19}, { 23, 19}, { 16, 20}, { 17, 20}, { 16, 21}, { 16, 22}, { 17, 21}, { 18, 20}, { 19, 20}, { 18, 21}, { 17, 22}, { 16, 23}, { 17, 23}, { 18, 22}, { 19, 21}, { 19, 22}, { 18, 23}, { 19, 23}, { 12, 24}, { 13, 24}, { 12, 25}, { 12, 26}, { 13, 25}, { 14, 24}, { 15, 24}, { 14, 25}, { 13, 26}, { 12, 27}, { 13, 27}, { 14, 26}, { 15, 25}, { 15, 26}, { 14, 27}, { 15, 27}, { 8, 28}, { 9, 28}, { 8, 29}, { 8, 30}, { 9, 29}, { 10, 28}, { 11, 28}, { 10, 29}, { 9, 30}, { 8, 31}, { 9, 31}, { 10, 30}, { 11, 29}, { 11, 30}, { 10, 31}, { 11, 31}, { 12, 28}, { 13, 28}, { 12, 29}, { 12, 30}, { 13, 29}, { 14, 28}, { 15, 28}, { 14, 29}, { 13, 30}, { 12, 31}, { 13, 31}, { 14, 30}, { 15, 29}, { 15, 30}, { 14, 31}, { 15, 31}, { 16, 24}, { 17, 24}, { 16, 25}, { 16, 26}, { 17, 25}, { 18, 24}, { 19, 24}, { 18, 25}, { 17, 26}, { 16, 27}, { 17, 27}, { 18, 26}, { 19, 25}, { 19, 26}, { 18, 27}, { 19, 27}, { 20, 20}, { 21, 20}, { 20, 21}, { 20, 22}, { 21, 21}, { 22, 20}, { 23, 20}, { 22, 21}, { 21, 22}, { 20, 23}, { 21, 23}, { 22, 22}, { 23, 21}, { 23, 22}, { 22, 23}, { 23, 23}, { 24, 16}, { 25, 16}, { 24, 17}, { 24, 18}, { 25, 17}, { 26, 16}, { 27, 16}, { 26, 17}, { 25, 18}, { 24, 19}, { 25, 19}, { 26, 18}, { 27, 17}, { 27, 18}, { 26, 19}, { 27, 19}, { 28, 12}, { 29, 12}, { 28, 13}, { 28, 14}, { 29, 13}, { 30, 12}, { 31, 12}, { 30, 13}, { 29, 14}, { 28, 15}, { 29, 15}, { 30, 14}, { 31, 13}, { 31, 14}, { 30, 15}, { 31, 15}, { 28, 16}, { 29, 16}, { 28, 17}, { 28, 18}, { 29, 17}, { 30, 16}, { 31, 16}, { 30, 17}, { 29, 18}, { 28, 19}, { 29, 19}, { 30, 18}, { 31, 17}, { 31, 18}, { 30, 19}, { 31, 19}, { 24, 20}, { 25, 20}, { 24, 21}, { 24, 22}, { 25, 21}, { 26, 20}, { 27, 20}, { 26, 21}, { 25, 22}, { 24, 23}, { 25, 23}, { 26, 22}, { 27, 21}, { 27, 22}, { 26, 23}, { 27, 23}, { 20, 24}, { 21, 24}, { 20, 25}, { 20, 26}, { 21, 25}, { 22, 24}, { 23, 24}, { 22, 25}, { 21, 26}, { 20, 27}, { 21, 27}, { 22, 26}, { 23, 25}, { 23, 26}, { 22, 27}, { 23, 27}, { 16, 28}, { 17, 28}, { 16, 29}, { 16, 30}, { 17, 29}, { 18, 28}, { 19, 28}, { 18, 29}, { 17, 30}, { 16, 31}, { 17, 31}, { 18, 30}, { 19, 29}, { 19, 30}, { 18, 31}, { 19, 31}, { 20, 28}, { 21, 28}, { 20, 29}, { 20, 30}, { 21, 29}, { 22, 28}, { 23, 28}, { 22, 29}, { 21, 30}, { 20, 31}, { 21, 31}, { 22, 30}, { 23, 29}, { 23, 30}, { 22, 31}, { 23, 31}, { 24, 24}, { 25, 24}, { 24, 25}, { 24, 26}, { 25, 25}, { 26, 24}, { 27, 24}, { 26, 25}, { 25, 26}, { 24, 27}, { 25, 27}, { 26, 26}, { 27, 25}, { 27, 26}, { 26, 27}, { 27, 27}, { 28, 20}, { 29, 20}, { 28, 21}, { 28, 22}, { 29, 21}, { 30, 20}, { 31, 20}, { 30, 21}, { 29, 22}, { 28, 23}, { 29, 23}, { 30, 22}, { 31, 21}, { 31, 22}, { 30, 23}, { 31, 23}, { 28, 24}, { 29, 24}, { 28, 25}, { 28, 26}, { 29, 25}, { 30, 24}, { 31, 24}, { 30, 25}, { 29, 26}, { 28, 27}, { 29, 27}, { 30, 26}, { 31, 25}, { 31, 26}, { 30, 27}, { 31, 27}, { 24, 28}, { 25, 28}, { 24, 29}, { 24, 30}, { 25, 29}, { 26, 28}, { 27, 28}, { 26, 29}, { 25, 30}, { 24, 31}, { 25, 31}, { 26, 30}, { 27, 29}, { 27, 30}, { 26, 31}, { 27, 31}, { 28, 28}, { 29, 28}, { 28, 29}, { 28, 30}, { 29, 29}, { 30, 28}, { 31, 28}, { 30, 29}, { 29, 30}, { 28, 31}, { 29, 31}, { 30, 30}, { 
31, 29}, { 31, 30}, { 30, 31}, { 31, 31} }; /* --------------------------------------------------------------------------- * coefficient scan order */ static const int16_t(*tab_scan_coeff[4][4])[2] = { /* 4x4 */ {tab_scan_4x4, NULL, NULL, tab_scan_4x4}, /* 8x8, 16x4, 4x16 */ {tab_scan_cg_8x8, tab_scan_4x16, tab_scan_16x4, tab_scan_cg_8x8}, /* 16x16, 32x8, 8x32 */ {tab_scan_cg_16x16, tab_scan_8x32, tab_scan_32x8, tab_scan_cg_16x16}, /* 32x32, 64x16, 16x64 */ {tab_scan_cg_32x32, NULL, NULL, tab_scan_cg_32x32}, }; /* --------------------------------------------------------------------------- * CG scan order */ static const int16_t(*tab_scan_cg[4][4])[2] = { /* 4x4 */ {tab_scan_2x2, NULL, NULL, tab_scan_2x2}, /* 8x8, 16x4, 4x16 */ {tab_scan_2x2, tab_scan_1x4, tab_scan_4x1, tab_scan_2x2}, /* 16x16, 32x8, 8x32 */ {tab_scan_4x4, tab_scan_2x8, tab_scan_8x2, tab_scan_4x4}, /* 32x32, 64x16, 16x64 */ {tab_scan_8x8, NULL, NULL, tab_scan_8x8}, }; #ifdef __cplusplus } #endif #endif // DAVS2_SCAN_TAB_H davs2-1.6/source/common/threadpool.cc000066400000000000000000000262771337322544400176600ustar00rootroot00000000000000/* * threadpool.cc * * Description of this file: * thread pooling functions definition of the davs2 library * * -------------------------------------------------------------------------- * * davs2 - video decoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn.
*/ #include "common.h" #include "threadpool.h" /** * =========================================================================== * type defines * =========================================================================== */ /* --------------------------------------------------------------------------- * job */ typedef struct threadpool_job_t { davs2_threadpool_func_t func; void *arg1; int arg2; void *ret; int wait; } threadpool_job_t; /* --------------------------------------------------------------------------- * synchronized job list */ typedef struct davs2_sync_job_list_t { int i_max_size; int i_size; davs2_thread_mutex_t mutex; davs2_thread_cond_t cv_fill; /* event signaling that the list became fuller */ davs2_thread_cond_t cv_empty; /* event signaling that the list became emptier */ threadpool_job_t *list[DAVS2_WORK_MAX + 2]; } davs2_sync_job_list_t; /* --------------------------------------------------------------------------- * thread pool */ struct davs2_threadpool_t { int i_exit; /* exit flag */ int num_total_threads; /* thread number in pool */ int num_run_threads; /* thread number running */ davs2_threadpool_func_t init_func; void *init_arg; int init_arg2; /* requires a synchronized list structure and associated methods, so use what is already implemented for jobs */ davs2_sync_job_list_t uninit; /* list of jobs that are awaiting use */ davs2_sync_job_list_t run; /* list of jobs that are queued for processing by the pool */ davs2_sync_job_list_t done; /* list of jobs that have finished processing */ /* handler of threads in the pool */ davs2_thread_t thread_handle[AVS2_THREAD_MAX]; }; /** * =========================================================================== * list operators * =========================================================================== */ /* --------------------------------------------------------------------------- */ static threadpool_job_t *davs2_job_shift(threadpool_job_t **list) { threadpool_job_t *job = list[0]; int i; for (i = 0; list[i]; i++) { list[i] = list[i + 1]; } assert(job); return job; } /** * =========================================================================== * list operators * =========================================================================== */ /* --------------------------------------------------------------------------- */ static int davs2_sync_job_list_init(davs2_sync_job_list_t *slist, int i_max_size) { if (i_max_size < 0) { return -1; } slist->i_max_size = i_max_size; slist->i_size = 0; memset(slist->list, 0, sizeof(slist->list)); if (davs2_thread_mutex_init(&slist->mutex, NULL) || davs2_thread_cond_init(&slist->cv_fill, NULL) || davs2_thread_cond_init(&slist->cv_empty, NULL)) { return -1; } return 0; } /* --------------------------------------------------------------------------- */ static void davs2_threadpool_list_delete(davs2_sync_job_list_t *slist) { davs2_thread_mutex_destroy(&slist->mutex); davs2_thread_cond_destroy(&slist->cv_fill); davs2_thread_cond_destroy(&slist->cv_empty); slist->i_size = 0; } /* --------------------------------------------------------------------------- */ static void davs2_sync_job_list_push(davs2_sync_job_list_t *slist, threadpool_job_t *job) { davs2_thread_mutex_lock(&slist->mutex); /* lock */ while (slist->i_size == slist->i_max_size) { davs2_thread_cond_wait(&slist->cv_empty, &slist->mutex); } slist->list[slist->i_size++] = job; davs2_thread_mutex_unlock(&slist->mutex); /* unlock */ davs2_thread_cond_broadcast(&slist->cv_fill); } /* 
--------------------------------------------------------------------------- */ static threadpool_job_t *davs2_sync_job_list_pop(davs2_sync_job_list_t *slist) { threadpool_job_t *job; davs2_thread_mutex_lock(&slist->mutex); /* lock */ while (!slist->i_size) { davs2_thread_cond_wait(&slist->cv_fill, &slist->mutex); } job = slist->list[--slist->i_size]; slist->list[slist->i_size] = NULL; davs2_thread_cond_broadcast(&slist->cv_empty); davs2_thread_mutex_unlock(&slist->mutex); /* unlock */ return job; } /** * =========================================================================== * thread pool operators * =========================================================================== */ /* --------------------------------------------------------------------------- */ static void *davs2_threadpool_thread(void *arg) { davs2_threadpool_t *pool = (davs2_threadpool_t *)arg; /* init */ if (pool->init_func) { pool->init_func(pool->init_arg, pool->init_arg2); } /* loop until exit flag is set */ while (pool->i_exit != AVS2_EXIT_THREAD) { threadpool_job_t *job = NULL; /* fetch a job */ davs2_thread_mutex_lock(&pool->run.mutex); /* lock */ while (pool->i_exit != AVS2_EXIT_THREAD && !pool->run.i_size) { davs2_thread_cond_wait(&pool->run.cv_fill, &pool->run.mutex); } if (pool->run.i_size) { job = davs2_job_shift(pool->run.list); pool->run.i_size--; } davs2_thread_mutex_unlock(&pool->run.mutex); /* unlock */ /* do the job */ if (!job) { continue; } job->ret = job->func(job->arg1, job->arg2); /* execute the function */ /* the job is done */ if (job->wait) { davs2_sync_job_list_push(&pool->done, job); } else { davs2_sync_job_list_push(&pool->uninit, job); } } return NULL; } /* --------------------------------------------------------------------------- */ int davs2_threadpool_init(davs2_threadpool_t **p_pool, int threads, davs2_threadpool_func_t init_func, void *init_arg1, int init_arg2) { davs2_threadpool_t *pool; uint32_t mem_size; uint8_t *mem_ptr; int i; if (threads <= 0) { return -1; } mem_size = sizeof(davs2_threadpool_t) + DAVS2_WORK_MAX * sizeof(threadpool_job_t) + CACHE_LINE_SIZE * (DAVS2_WORK_MAX + 2); CHECKED_MALLOCZERO(mem_ptr, uint8_t *, mem_size); *p_pool = pool = (davs2_threadpool_t *)mem_ptr; mem_ptr += sizeof(davs2_threadpool_t); ALIGN_POINTER(mem_ptr); pool->init_func = init_func; pool->init_arg = init_arg1; pool->init_arg2 = init_arg2; pool->num_total_threads = DAVS2_MIN(threads, AVS2_THREAD_MAX); pool->num_run_threads = 0; if (davs2_sync_job_list_init(&pool->uninit, DAVS2_WORK_MAX) || davs2_sync_job_list_init(&pool->run, DAVS2_WORK_MAX) || davs2_sync_job_list_init(&pool->done, DAVS2_WORK_MAX)) { goto fail; } for (i = 0; i < DAVS2_WORK_MAX; i++) { threadpool_job_t *job = (threadpool_job_t *)mem_ptr; mem_ptr += sizeof(threadpool_job_t); ALIGN_POINTER(mem_ptr); davs2_sync_job_list_push(&pool->uninit, job); } for (i = 0; i < pool->num_total_threads; i++) { if (davs2_thread_create(pool->thread_handle + i, NULL, davs2_threadpool_thread, pool)) { goto fail; } } return 0; fail: return -1; } /* --------------------------------------------------------------------------- */ void davs2_threadpool_run(davs2_threadpool_t *pool, davs2_threadpool_func_t func, void *arg1, int arg2, int wait_sign) { threadpool_job_t *job = davs2_sync_job_list_pop(&pool->uninit); job->func = func; job->arg1 = arg1; job->arg2 = arg2; job->wait = wait_sign; davs2_sync_job_list_push(&pool->run, job); }
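/* ---------------------------------------------------------------------------
 * NOTE: illustrative sketch, not part of davs2. The three job lists above
 * (uninit / run / done) all use one pattern: a fixed-capacity queue guarded
 * by a mutex and two condition variables, where push blocks on cv_empty when
 * the queue is full and pop blocks on cv_fill when it is empty; a job cycles
 * uninit -> run -> done (or straight back to uninit when wait == 0). A
 * minimal standalone version of the same queue discipline, written against
 * raw pthreads instead of the davs2_thread_* wrappers, might look like
 * this: */
#if 0
#include <pthread.h>

typedef struct bounded_queue_t {
    void           *slot[16];   /* fixed capacity, like DAVS2_WORK_MAX */
    int             size;
    pthread_mutex_t mutex;
    pthread_cond_t  cv_fill;    /* signaled when the queue gets fuller  */
    pthread_cond_t  cv_empty;   /* signaled when the queue gets emptier */
} bounded_queue_t;

static void queue_push(bounded_queue_t *q, void *item)
{
    pthread_mutex_lock(&q->mutex);
    while (q->size == 16) {                 /* full: wait for a pop */
        pthread_cond_wait(&q->cv_empty, &q->mutex);
    }
    q->slot[q->size++] = item;
    pthread_mutex_unlock(&q->mutex);
    pthread_cond_broadcast(&q->cv_fill);    /* wake waiting consumers */
}

static void *queue_pop(bounded_queue_t *q)
{
    void *item;
    pthread_mutex_lock(&q->mutex);
    while (q->size == 0) {                  /* empty: wait for a push */
        pthread_cond_wait(&q->cv_fill, &q->mutex);
    }
    item = q->slot[--q->size];
    pthread_cond_broadcast(&q->cv_empty);   /* wake waiting producers */
    pthread_mutex_unlock(&q->mutex);
    return item;
}
#endif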
/* --------------------------------------------------------------------------- * check whether the thread pool is idle (no job queued for running) */ int davs2_threadpool_is_free(davs2_threadpool_t *pool) { return pool->run.i_size <= 0; } /* --------------------------------------------------------------------------- */ void *davs2_threadpool_wait(davs2_threadpool_t *pool, void *arg1, int arg2) { threadpool_job_t *job = NULL; void *ret; int i; davs2_thread_mutex_lock(&pool->done.mutex); /* lock */ while (!job) { for (i = 0; i < pool->done.i_size; i++) { threadpool_job_t *t = pool->done.list[i]; if (t->arg1 == arg1 && t->arg2 == arg2) { job = davs2_job_shift(pool->done.list + i); pool->done.i_size--; break; /* found the job according to arg */ } } if (!job) { davs2_thread_cond_wait(&pool->done.cv_fill, &pool->done.mutex); } } davs2_thread_mutex_unlock(&pool->done.mutex); /* unlock */ ret = job->ret; davs2_sync_job_list_push(&pool->uninit, job); return ret; } /* --------------------------------------------------------------------------- */ void davs2_threadpool_delete(davs2_threadpool_t *pool) { int i; davs2_thread_mutex_lock(&pool->run.mutex); /* lock */ pool->i_exit = AVS2_EXIT_THREAD; davs2_thread_cond_broadcast(&pool->run.cv_fill); davs2_thread_mutex_unlock(&pool->run.mutex); /* unlock */ for (i = 0; i < pool->num_total_threads; i++) { davs2_thread_join(pool->thread_handle[i], NULL); } davs2_threadpool_list_delete(&pool->uninit); davs2_threadpool_list_delete(&pool->run); davs2_threadpool_list_delete(&pool->done); davs2_free(pool); } davs2-1.6/source/common/threadpool.h000066400000000000000000000045141337322544400175140ustar00rootroot00000000000000/* * threadpool.h * * Description of this file: * thread pooling functions definition of the davs2 library * * -------------------------------------------------------------------------- * * davs2 - video decoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */
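/* ---------------------------------------------------------------------------
 * Usage sketch (illustrative, not part of the library): a caller creates the
 * pool once, submits jobs with davs2_threadpool_run(), and, for jobs queued
 * with wait_sign != 0, later collects the return value by matching the same
 * (arg1, arg2) pair in davs2_threadpool_wait(), which blocks until that job
 * has finished. The decode_one_row() job below is hypothetical and only
 * stands in for a real worker function. */
#if 0
static void *decode_one_row(void *h, int row)   /* hypothetical job */
{
    /* ... do the work for row 'row' of decoder handle 'h' ... */
    return h;
}

static void example_usage(void *h)
{
    davs2_threadpool_t *pool = NULL;
    int row;
    if (davs2_threadpool_init(&pool, 4, NULL, NULL, 0) < 0) {
        return;                                 /* pool creation failed */
    }
    for (row = 0; row < 8; row++) {             /* submit 8 jobs */
        davs2_threadpool_run(pool, decode_one_row, h, row, 1 /* wait */);
    }
    for (row = 0; row < 8; row++) {             /* collect them all */
        davs2_threadpool_wait(pool, h, row);
    }
    davs2_threadpool_delete(pool);
}
#endif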
#ifndef DAVS2_THREADPOOL_H #define DAVS2_THREADPOOL_H #ifdef __cplusplus extern "C" { #endif typedef struct davs2_threadpool_t davs2_threadpool_t; typedef void *(*davs2_threadpool_func_t)(void *arg1, int arg2); #define davs2_threadpool_init FPFX(threadpool_init) int davs2_threadpool_init (davs2_threadpool_t **p_pool, int threads, davs2_threadpool_func_t init_func, void *init_arg1, int init_arg2); #define davs2_threadpool_run FPFX(threadpool_run) void davs2_threadpool_run (davs2_threadpool_t *pool, davs2_threadpool_func_t func, void *arg1, int arg2, int wait_sign); #define davs2_threadpool_is_free FPFX(threadpool_is_free) int davs2_threadpool_is_free(davs2_threadpool_t *pool); #define davs2_threadpool_wait FPFX(threadpool_wait) void *davs2_threadpool_wait (davs2_threadpool_t *pool, void *arg1, int arg2); #define davs2_threadpool_delete FPFX(threadpool_delete) void davs2_threadpool_delete(davs2_threadpool_t *pool); #ifdef __cplusplus } #endif #endif // DAVS2_THREADPOOL_H davs2-1.6/source/common/transform.cc000066400000000000000000001177021337322544400175300ustar00rootroot00000000000000/* * transform.cc * * Description of this file: * Transform functions definition of the davs2 library * * -------------------------------------------------------------------------- * * davs2 - video decoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn.
*/ #include "common.h" #include "quant.h" #include "transform.h" #include "block_info.h" #if HAVE_MMX #include "vec/intrinsic.h" #include "x86/dct8.h" #endif /** * =========================================================================== * global & local variables * =========================================================================== */ /* --------------------------------------------------------------------------- * transform */ #define LOT_MAX_WLT_TAP 2 // number of wavelet transform tap, (5-3) /* --------------------------------------------------------------------------- */ static const int16_t g_T4[4][4] = { { 32, 32, 32, 32 }, { 42, 17, -17, -42 }, { 32, -32, -32, 32 }, { 17, -42, 42, -17 } }; /* --------------------------------------------------------------------------- */ static const int16_t g_T8[8][8] = { { 32, 32, 32, 32, 32, 32, 32, 32 }, { 44, 38, 25, 9, -9, -25, -38, -44 }, { 42, 17, -17, -42, -42, -17, 17, 42 }, { 38, -9, -44, -25, 25, 44, 9, -38 }, { 32, -32, -32, 32, 32, -32, -32, 32 }, { 25, -44, 9, 38, -38, -9, 44, -25 }, { 17, -42, 42, -17, -17, 42, -42, 17 }, { 9, -25, 38, -44, 44, -38, 25, -9 } }; /* --------------------------------------------------------------------------- */ static const int16_t g_T16[16][16] = { { 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32 }, { 45, 43, 40, 35, 29, 21, 13, 4, -4, -13, -21, -29, -35, -40, -43, -45 }, { 44, 38, 25, 9, -9, -25, -38, -44, -44, -38, -25, -9, 9, 25, 38, 44 }, { 43, 29, 4, -21, -40, -45, -35, -13, 13, 35, 45, 40, 21, -4, -29, -43 }, { 42, 17, -17, -42, -42, -17, 17, 42, 42, 17, -17, -42, -42, -17, 17, 42 }, { 40, 4, -35, -43, -13, 29, 45, 21, -21, -45, -29, 13, 43, 35, -4, -40 }, { 38, -9, -44, -25, 25, 44, 9, -38, -38, 9, 44, 25, -25, -44, -9, 38 }, { 35, -21, -43, 4, 45, 13, -40, -29, 29, 40, -13, -45, -4, 43, 21, -35 }, { 32, -32, -32, 32, 32, -32, -32, 32, 32, -32, -32, 32, 32, -32, -32, 32 }, { 29, -40, -13, 45, -4, -43, 21, 35, -35, -21, 43, 4, -45, 13, 40, -29 }, { 25, -44, 9, 38, -38, -9, 44, -25, -25, 44, -9, -38, 38, 9, -44, 25 }, { 21, -45, 29, 13, -43, 35, 4, -40, 40, -4, -35, 43, -13, -29, 45, -21 }, { 17, -42, 42, -17, -17, 42, -42, 17, 17, -42, 42, -17, -17, 42, -42, 17 }, { 13, -35, 45, -40, 21, 4, -29, 43, -43, 29, -4, -21, 40, -45, 35, -13 }, { 9, -25, 38, -44, 44, -38, 25, -9, -9, 25, -38, 44, -44, 38, -25, 9 }, { 4, -13, 21, -29, 35, -40, 43, -45, 45, -43, 40, -35, 29, -21, 13, -4 } }; /* --------------------------------------------------------------------------- */ static const int16_t g_T32[32][32] = { { 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32 }, { 45, 45, 44, 43, 41, 39, 36, 34, 30, 27, 23, 19, 15, 11, 7, 2, -2, -7,-11,-15,-19,-23,-27,-30,-34,-36,-39,-41,-43,-44,-45,-45 }, { 45, 43, 40, 35, 29, 21, 13, 4, -4,-13,-21,-29,-35,-40,-43,-45,-45,-43,-40,-35,-29,-21,-13, -4, 4, 13, 21, 29, 35, 40, 43, 45 }, { 45, 41, 34, 23, 11, -2,-15,-27,-36,-43,-45,-44,-39,-30,-19, -7, 7, 19, 30, 39, 44, 45, 43, 36, 27, 15, 2,-11,-23,-34,-41,-45 }, { 44, 38, 25, 9, -9,-25,-38,-44,-44,-38,-25, -9, 9, 25, 38, 44, 44, 38, 25, 9, -9,-25,-38,-44,-44,-38,-25, -9, 9, 25, 38, 44 }, { 44, 34, 15, -7,-27,-41,-45,-39,-23, -2, 19, 36, 45, 43, 30, 11,-11,-30,-43,-45,-36,-19, 2, 23, 39, 45, 41, 27, 7,-15,-34,-44 }, { 43, 29, 4,-21,-40,-45,-35,-13, 13, 35, 45, 40, 21, -4,-29,-43,-43,-29, -4, 21, 40, 45, 35, 13,-13,-35,-45,-40,-21, 4, 29, 43 }, { 43, 23, -7,-34,-45,-36,-11, 19, 41, 44, 27, -2,-30,-45,-39,-15, 15, 39, 45, 
30, 2,-27,-44,-41,-19, 11, 36, 45, 34, 7, -23,-43 }, { 42, 17,-17,-42,-42,-17, 17, 42, 42, 17,-17,-42,-42,-17, 17, 42, 42, 17,-17,-42,-42,-17, 17, 42, 42, 17,-17,-42,-42,-17, 17, 42 }, { 41, 11,-27,-45,-30, 7, 39, 43, 15,-23,-45,-34, 2, 36, 44, 19,-19,-44,-36, -2, 34, 45, 23,-15,-43,-39, -7, 30, 45, 27,-11,-41 }, { 40, 4,-35,-43,-13, 29, 45, 21,-21,-45,-29, 13, 43, 35, -4,-40,-40, -4, 35, 43, 13,-29,-45,-21, 21, 45, 29,-13,-43,-35, 4, 40 }, { 39, -2,-41,-36, 7, 43, 34,-11,-44,-30, 15, 45, 27,-19,-45,-23, 23, 45, 19,-27,-45,-15, 30, 44, 11,-34,-43, -7, 36, 41, 2,-39 }, { 38, -9,-44,-25, 25, 44, 9,-38,-38, 9, 44, 25,-25,-44, -9, 38, 38, -9,-44,-25, 25, 44, 9,-38,-38, 9, 44, 25,-25,-44, -9, 38 }, { 36,-15,-45,-11, 39, 34,-19,-45, -7, 41, 30,-23,-44, -2, 43, 27,-27,-43, 2, 44, 23,-30,-41, 7, 45, 19,-34,-39, 11, 45, 15,-36 }, { 35,-21,-43, 4, 45, 13,-40,-29, 29, 40,-13,-45, -4, 43, 21,-35,-35, 21, 43, -4,-45,-13, 40, 29,-29,-40, 13, 45, 4,-43,-21, 35 }, { 34,-27,-39, 19, 43,-11,-45, 2, 45, 7,-44,-15, 41, 23,-36,-30, 30, 36,-23,-41, 15, 44, -7,-45, -2, 45, 11,-43,-19, 39, 27,-34 }, { 32,-32,-32, 32, 32,-32,-32, 32, 32,-32,-32, 32, 32,-32,-32, 32, 32,-32,-32, 32, 32,-32,-32, 32, 32,-32,-32, 32, 32,-32,-32, 32 }, { 30,-36,-23, 41, 15,-44, -7, 45, -2,-45, 11, 43,-19,-39, 27, 34,-34,-27, 39, 19,-43,-11, 45, 2,-45, 7, 44,-15,-41, 23, 36,-30 }, { 29,-40,-13, 45, -4,-43, 21, 35,-35,-21, 43, 4,-45, 13, 40,-29,-29, 40, 13,-45, 4, 43,-21,-35, 35, 21,-43, -4, 45,-13,-40, 29 }, { 27,-43, -2, 44,-23,-30, 41, 7,-45, 19, 34,-39,-11, 45,-15,-36, 36, 15,-45, 11, 39,-34,-19, 45, -7,-41, 30, 23,-44, 2, 43,-27 }, { 25,-44, 9, 38,-38, -9, 44,-25,-25, 44, -9,-38, 38, 9,-44, 25, 25,-44, 9, 38,-38, -9, 44,-25,-25, 44, -9,-38, 38, 9,-44, 25 }, { 23,-45, 19, 27,-45, 15, 30,-44, 11, 34,-43, 7, 36,-41, 2, 39,-39, -2, 41,-36, -7, 43,-34,-11, 44,-30,-15, 45,-27,-19, 45,-23 }, { 21,-45, 29, 13,-43, 35, 4,-40, 40, -4,-35, 43,-13,-29, 45,-21,-21, 45,-29,-13, 43,-35, -4, 40,-40, 4, 35,-43, 13, 29,-45, 21 }, { 19,-44, 36, -2,-34, 45,-23,-15, 43,-39, 7, 30,-45, 27, 11,-41, 41,-11,-27, 45,-30, -7, 39,-43, 15, 23,-45, 34, 2,-36, 44,-19 }, { 17,-42, 42,-17,-17, 42,-42, 17, 17,-42, 42,-17,-17, 42,-42, 17, 17,-42, 42,-17,-17, 42,-42, 17, 17,-42, 42,-17,-17, 42,-42, 17 }, { 15,-39, 45,-30, 2, 27,-44, 41,-19,-11, 36,-45, 34, -7,-23, 43,-43, 23, 7,-34, 45,-36, 11, 19,-41, 44,-27, -2, 30,-45, 39,-15 }, { 13,-35, 45,-40, 21, 4,-29, 43,-43, 29, -4,-21, 40,-45, 35,-13,-13, 35,-45, 40,-21, -4, 29,-43, 43,-29, 4, 21,-40, 45,-35, 13 }, { 11,-30, 43,-45, 36,-19, -2, 23,-39, 45,-41, 27, -7,-15, 34,-44, 44,-34, 15, 7,-27, 41,-45, 39,-23, 2, 19,-36, 45,-43, 30,-11 }, { 9,-25, 38,-44, 44,-38, 25, -9, -9, 25,-38, 44,-44, 38,-25, 9, 9,-25, 38,-44, 44,-38, 25, -9, -9, 25,-38, 44,-44, 38,-25, 9 }, { 7,-19, 30,-39, 44,-45, 43,-36, 27,-15, 2, 11,-23, 34,-41, 45,-45, 41,-34, 23,-11, -2, 15,-27, 36,-43, 45,-44, 39,-30, 19, -7 }, { 4,-13, 21,-29, 35,-40, 43,-45, 45,-43, 40,-35, 29,-21, 13, -4, -4, 13,-21, 29,-35, 40,-43, 45,-45, 43,-40, 35,-29, 21,-13, 4 }, { 2, -7, 11,-15, 19,-23, 27,-30, 34,-36, 39,-41, 43,-44, 45,-45, 45,-45, 44,-43, 41,-39, 36,-34, 30,-27, 23,-19, 15,-11, 7, -2 } }; /* --------------------------------------------------------------------------- */ ALIGN16(static const int16_t g_2T[SEC_TR_SIZE * SEC_TR_SIZE]) = { 123, -35, -8, -3, -32, -120, 30, 10, 14, 25, 123, -22, 8, 13, 19, 126 }; /* --------------------------------------------------------------------------- */ ALIGN16(static const int16_t g_2T_C[SEC_TR_SIZE * SEC_TR_SIZE]) = { 34, 58, 
72, 81, 77, 69, -7, -75, 79, -33, -75, 58, 55, -84, 73, -28 }; /** * =========================================================================== * function defines * =========================================================================== */ /* --------------------------------------------------------------------------- */ static void partialButterflyInverse4_c(const coeff_t *src, coeff_t *dst, int shift, int line, int clip_depth) { int E[2], O[2]; const int max_val = (1 << (clip_depth - 1)) - 1; const int min_val = -max_val - 1; const int add = 1 << (shift - 1); int j; for (j = 0; j < line; j++) { /* utilizing symmetry properties to the maximum to * minimize the number of multiplications */ O[0] = g_T4[1][0] * src[line] + g_T4[3][0] * src[3 * line]; O[1] = g_T4[1][1] * src[line] + g_T4[3][1] * src[3 * line]; E[0] = g_T4[0][0] * src[0 ] + g_T4[2][0] * src[2 * line]; E[1] = g_T4[0][1] * src[0 ] + g_T4[2][1] * src[2 * line]; /* combining even and odd terms at each hierarchy levels to * calculate the final spatial domain vector */ dst[0] = (coeff_t)DAVS2_CLIP3(min_val, max_val, ((E[0] + O[0] + add) >> shift)); dst[1] = (coeff_t)DAVS2_CLIP3(min_val, max_val, ((E[1] + O[1] + add) >> shift)); dst[2] = (coeff_t)DAVS2_CLIP3(min_val, max_val, ((E[1] - O[1] + add) >> shift)); dst[3] = (coeff_t)DAVS2_CLIP3(min_val, max_val, ((E[0] - O[0] + add) >> shift)); src++; dst += 4; } } /* --------------------------------------------------------------------------- */ static void idct_4x4_c(const coeff_t *src, coeff_t *dst, int i_dst) { #define BSIZE 4 ALIGN32(coeff_t coeff[BSIZE * BSIZE]); ALIGN32(coeff_t block[BSIZE * BSIZE]); int shift1 = 5; int shift2 = 20 - g_bit_depth; int clip_depth1 = LIMIT_BIT; int clip_depth2 = g_bit_depth + 1; int i; partialButterflyInverse4_c( src, coeff, shift1, BSIZE, clip_depth1); partialButterflyInverse4_c(coeff, block, shift2, BSIZE, clip_depth2); for (i = 0; i < BSIZE; i++) { memcpy(dst, &block[i * BSIZE], BSIZE * sizeof(coeff_t)); dst += i_dst; } #undef BSIZE } /* --------------------------------------------------------------------------- */ static void partialButterflyInverse8_c(const coeff_t *src, coeff_t *dst, int shift, int line, int clip_depth) { int E[4], O[4]; int EE[2], EO[2]; const int max_val = (1 << (clip_depth - 1)) - 1; const int min_val = -max_val - 1; const int add = 1 << (shift - 1); int j, k; for (j = 0; j < line; j++) { /* utilizing symmetry properties to the maximum to * minimize the number of multiplications */ for (k = 0; k < 4; k++) { O[k] = g_T8[1][k] * src[ line] + g_T8[3][k] * src[3 * line] + g_T8[5][k] * src[5 * line] + g_T8[7][k] * src[7 * line]; } EO[0] = g_T8[2][0] * src[2 * line] + g_T8[6][0] * src[6 * line]; EO[1] = g_T8[2][1] * src[2 * line] + g_T8[6][1] * src[6 * line]; EE[0] = g_T8[0][0] * src[0 ] + g_T8[4][0] * src[4 * line]; EE[1] = g_T8[0][1] * src[0 ] + g_T8[4][1] * src[4 * line]; /* combining even and odd terms at each hierarchy levels to * calculate the final spatial domain vector */ E[0] = EE[0] + EO[0]; E[3] = EE[0] - EO[0]; E[1] = EE[1] + EO[1]; E[2] = EE[1] - EO[1]; for (k = 0; k < 4; k++) { dst[k] = (coeff_t)DAVS2_CLIP3(min_val, max_val, ((E[k] + O[k] + add) >> shift)); dst[k + 4] = (coeff_t)DAVS2_CLIP3(min_val, max_val, ((E[3 - k] - O[3 - k] + add) >> shift)); } src++; dst += 8; } } /* --------------------------------------------------------------------------- */ static void idct_8x8_c(const coeff_t *src, coeff_t *dst, int i_dst) { #define BSIZE 8 ALIGN32(coeff_t coeff[BSIZE * BSIZE]); ALIGN32(coeff_t block[BSIZE * BSIZE]); int 
shift1 = 5; int shift2 = 20 - g_bit_depth; int clip_depth1 = LIMIT_BIT; int clip_depth2 = g_bit_depth + 1; int i; partialButterflyInverse8_c( src, coeff, shift1, BSIZE, clip_depth1); partialButterflyInverse8_c(coeff, block, shift2, BSIZE, clip_depth2); for (i = 0; i < BSIZE; i++) { memcpy(&dst[0], &block[i * BSIZE], BSIZE * sizeof(coeff_t)); dst += i_dst; } #undef BSIZE } /* --------------------------------------------------------------------------- */ static void partialButterflyInverse16_c(const coeff_t *src, coeff_t *dst, int shift, int line, int clip_depth) { int E[8], O[8]; int EE[4], EO[4]; int EEE[2], EEO[2]; const int max_val = (1 << (clip_depth - 1)) - 1; const int min_val = -max_val - 1; const int add = 1 << (shift - 1); int j, k; for (j = 0; j < line; j++) { /* utilizing symmetry properties to the maximum to * minimize the number of multiplications */ for (k = 0; k < 8; k++) { O[k] = g_T16[ 1][k] * src[ line] + g_T16[ 3][k] * src[ 3 * line] + g_T16[ 5][k] * src[ 5 * line] + g_T16[ 7][k] * src[ 7 * line] + g_T16[ 9][k] * src[ 9 * line] + g_T16[11][k] * src[11 * line] + g_T16[13][k] * src[13 * line] + g_T16[15][k] * src[15 * line]; } for (k = 0; k < 4; k++) { EO[k] = g_T16[ 2][k] * src[ 2 * line] + g_T16[ 6][k] * src[ 6 * line] + g_T16[10][k] * src[10 * line] + g_T16[14][k] * src[14 * line]; } EEO[0] = g_T16[4][0] * src[4 * line] + g_T16[12][0] * src[12 * line]; EEE[0] = g_T16[0][0] * src[0 ] + g_T16[ 8][0] * src[ 8 * line]; EEO[1] = g_T16[4][1] * src[4 * line] + g_T16[12][1] * src[12 * line]; EEE[1] = g_T16[0][1] * src[0 ] + g_T16[ 8][1] * src[ 8 * line]; /* combining even and odd terms at each hierarchy levels to * calculate the final spatial domain vector */ for (k = 0; k < 2; k++) { EE[k ] = EEE[k ] + EEO[k ]; EE[k + 2] = EEE[1 - k] - EEO[1 - k]; } for (k = 0; k < 4; k++) { E[k ] = EE[k ] + EO[k ]; E[k + 4] = EE[3 - k] - EO[3 - k]; } for (k = 0; k < 8; k++) { dst[k] = (coeff_t)DAVS2_CLIP3(min_val, max_val, ((E[k] + O[k] + add) >> shift)); dst[k + 8] = (coeff_t)DAVS2_CLIP3(min_val, max_val, ((E[7 - k] - O[7 - k] + add) >> shift)); } src++; dst += 16; } } /* --------------------------------------------------------------------------- */ static void idct_16x16_c(const coeff_t *src, coeff_t *dst, int i_dst) { #define BSIZE 16 ALIGN32(coeff_t coeff[BSIZE * BSIZE]); ALIGN32(coeff_t block[BSIZE * BSIZE]); int shift1 = 5; int shift2 = 20 - g_bit_depth; int clip_depth1 = LIMIT_BIT; int clip_depth2 = g_bit_depth + 1; int i; partialButterflyInverse16_c( src, coeff, shift1, BSIZE, clip_depth1); partialButterflyInverse16_c(coeff, block, shift2, BSIZE, clip_depth2); for (i = 0; i < BSIZE; i++) { memcpy(&dst[0], &block[i * BSIZE], BSIZE * sizeof(coeff_t)); dst += i_dst; } #undef BSIZE } /* --------------------------------------------------------------------------- */ static void partialButterflyInverse32_c(const coeff_t *src, coeff_t *dst, int shift, int line, int clip_depth) { int E[16], O[16]; int EE[8], EO[8]; int EEE[4], EEO[4]; int EEEE[2], EEEO[2]; const int max_val = (1 << (clip_depth - 1)) - 1; const int min_val = -max_val - 1; const int add = 1 << (shift - 1); int j, k; for (j = 0; j < line; j++) { /* utilizing symmetry properties to the maximum to * minimize the number of multiplications */ for (k = 0; k < 16; k++) { O[k] = g_T32[ 1][k] * src[ line] + g_T32[ 3][k] * src[ 3 * line] + g_T32[ 5][k] * src[ 5 * line] + g_T32[ 7][k] * src[ 7 * line] + g_T32[ 9][k] * src[ 9 * line] + g_T32[11][k] * src[11 * line] + g_T32[13][k] * src[13 * line] + g_T32[15][k] * src[15 * line] + 
g_T32[17][k] * src[17 * line] + g_T32[19][k] * src[19 * line] + g_T32[21][k] * src[21 * line] + g_T32[23][k] * src[23 * line] + g_T32[25][k] * src[25 * line] + g_T32[27][k] * src[27 * line] + g_T32[29][k] * src[29 * line] + g_T32[31][k] * src[31 * line]; } for (k = 0; k < 8; k++) { EO[k] = g_T32[ 2][k] * src[ 2 * line] + g_T32[ 6][k] * src[ 6 * line] + g_T32[10][k] * src[10 * line] + g_T32[14][k] * src[14 * line] + g_T32[18][k] * src[18 * line] + g_T32[22][k] * src[22 * line] + g_T32[26][k] * src[26 * line] + g_T32[30][k] * src[30 * line]; } for (k = 0; k < 4; k++) { EEO[k] = g_T32[ 4][k] * src[ 4 * line] + g_T32[12][k] * src[12 * line] + g_T32[20][k] * src[20 * line] + g_T32[28][k] * src[28 * line]; } EEEO[0] = g_T32[8][0] * src[8 * line] + g_T32[24][0] * src[24 * line]; EEEO[1] = g_T32[8][1] * src[8 * line] + g_T32[24][1] * src[24 * line]; EEEE[0] = g_T32[0][0] * src[0 ] + g_T32[16][0] * src[16 * line]; EEEE[1] = g_T32[0][1] * src[0 ] + g_T32[16][1] * src[16 * line]; /* combining even and odd terms at each hierarchy levels to * calculate the final spatial domain vector */ EEE[0] = EEEE[0] + EEEO[0]; EEE[3] = EEEE[0] - EEEO[0]; EEE[1] = EEEE[1] + EEEO[1]; EEE[2] = EEEE[1] - EEEO[1]; for (k = 0; k < 4; k++) { EE[k ] = EEE[k ] + EEO[k ]; EE[k + 4] = EEE[3 - k] - EEO[3 - k]; } for (k = 0; k < 8; k++) { E[k ] = EE[k ] + EO[k ]; E[k + 8] = EE[7 - k] - EO[7 - k]; } for (k = 0; k < 16; k++) { dst[k] = (coeff_t)DAVS2_CLIP3(min_val, max_val, ((E[k] + O[k] + add) >> shift)); dst[k + 16] = (coeff_t)DAVS2_CLIP3(min_val, max_val, ((E[15 - k] - O[15 - k] + add) >> shift)); } src++; dst += 32; } } /* --------------------------------------------------------------------------- * NOTE: * i_dst - the stride of dst (the lowest bit is additional wavelet flag) */ static void idct_32x32_c(const coeff_t *src, coeff_t *dst, int i_dst) { #define BSIZE 32 ALIGN32(coeff_t coeff[BSIZE * BSIZE]); ALIGN32(coeff_t block[BSIZE * BSIZE]); int a_flag = i_dst & 0x01; int shift1 = 5; int shift2 = 20 - g_bit_depth - a_flag; int clip_depth1 = LIMIT_BIT; int clip_depth2 = g_bit_depth + 1 + a_flag; int i; i_dst &= 0xFE; /* remember to remove the flag bit */ partialButterflyInverse32_c( src, coeff, shift1, BSIZE, clip_depth1); partialButterflyInverse32_c(coeff, block, shift2, BSIZE, clip_depth2); for (i = 0; i < BSIZE; i++) { memcpy(&dst[0], &block[i * BSIZE], BSIZE * sizeof(coeff_t)); dst += i_dst; } #undef BSIZE } /* --------------------------------------------------------------------------- */ static void idct_64x64_c(const coeff_t *src, coeff_t *dst, int i_dst) { ALIGN32(coeff_t row_buf[64 + LOT_MAX_WLT_TAP * 2]); coeff_t *pExt = row_buf + LOT_MAX_WLT_TAP; const int N0 = 64; const int N1 = 64 >> 1; int x, y, offset; /* step 0: idct 32x32 transform */ idct_32x32_c(src, dst, i_dst | 1); /* step 1: vertical transform */ for (x = 0; x < N0; x++) { /* copy */ for (y = 0, offset = 0; y < N1; y++, offset += 32) { pExt[y << 1] = dst[x + offset]; } /* reflection */ pExt[N0] = pExt[N0 - 2]; /* filtering (even pixel) */ for (y = 0; y <= N0; y += 2) { pExt[y] >>= 1; } /* filtering (odd pixel) */ for (y = 1; y < N0; y += 2) { pExt[y] = (pExt[y - 1] + pExt[y + 1]) >> 1; } /* copy */ for (y = 0, offset = 0; y < N0; y++, offset += N0) { dst[x + offset] = pExt[y]; } } /* step 2: horizontal transform */ for (y = 0, offset = 0; y < N0; y++, offset += N0) { /* copy */ for (x = 0; x < N1; x++) { pExt[x << 1] = dst[offset + x]; } /* reflection */ pExt[N0] = pExt[N0 - 2]; /* filtering (odd pixel) */ for (x = 1; x < N0; x += 2) { pExt[x] = (pExt[x - 
1] + pExt[x + 1]) >> 1; } /* copy */ memcpy(dst + offset, pExt, N0 * sizeof(coeff_t)); } } /* --------------------------------------------------------------------------- */ static void idct_16x4_c(const coeff_t *src, coeff_t *dst, int i_dst) { #define BSIZE_H 16 #define BSIZE_V 4 ALIGN32(coeff_t coeff[BSIZE_H * BSIZE_V]); ALIGN32(coeff_t block[BSIZE_H * BSIZE_V]); int shift1 = 5; int shift2 = 20 - g_bit_depth; int clip_depth1 = LIMIT_BIT; int clip_depth2 = g_bit_depth + 1; int i; partialButterflyInverse4_c (src, coeff, shift1, BSIZE_H, clip_depth1); partialButterflyInverse16_c(coeff, block, shift2, BSIZE_V, clip_depth2); for (i = 0; i < BSIZE_V; i++) { memcpy(&dst[i * i_dst], &block[i * BSIZE_H], BSIZE_H * sizeof(coeff_t)); } #undef BSIZE_H #undef BSIZE_V } /* --------------------------------------------------------------------------- */ static void idct_4x16_c(const coeff_t *src, coeff_t *dst, int i_dst) { #define BSIZE_H 4 #define BSIZE_V 16 ALIGN32(coeff_t coeff[BSIZE_H * BSIZE_V]); ALIGN32(coeff_t block[BSIZE_H * BSIZE_V]); int shift1 = 5; int shift2 = 20 - g_bit_depth; int clip_depth1 = LIMIT_BIT; int clip_depth2 = g_bit_depth + 1; int i; partialButterflyInverse16_c(src, coeff, shift1, BSIZE_H, clip_depth1); partialButterflyInverse4_c (coeff, block, shift2, BSIZE_V, clip_depth2); for (i = 0; i < BSIZE_V; i++) { memcpy(&dst[i * i_dst], &block[i * BSIZE_H], BSIZE_H * sizeof(coeff_t)); } #undef BSIZE_H #undef BSIZE_V } /* --------------------------------------------------------------------------- * NOTE: * i_dst - the stride of dst (the lowest bit is additional wavelet flag) */ static void idct_32x8_c(const coeff_t *src, coeff_t *dst, int i_dst) { #define BSIZE_H 32 #define BSIZE_V 8 ALIGN32(coeff_t coeff[BSIZE_H * BSIZE_V]); ALIGN32(coeff_t block[BSIZE_H * BSIZE_V]); int shift1 = 5; int shift2 = 20 - g_bit_depth - (i_dst & 0x01); int clip_depth1 = LIMIT_BIT; int clip_depth2 = g_bit_depth + 1 + (i_dst & 0x01); int i; partialButterflyInverse8_c (src, coeff, shift1, BSIZE_H, clip_depth1); partialButterflyInverse32_c(coeff, block, shift2, BSIZE_V, clip_depth2); i_dst &= 0xFE; for (i = 0; i < BSIZE_V; i++) { memcpy(&dst[i * i_dst], &block[i * BSIZE_H], BSIZE_H * sizeof(coeff_t)); } #undef BSIZE_H #undef BSIZE_V } /* --------------------------------------------------------------------------- * NOTE: * i_dst - the stride of dst (the lowest bit is additional wavelet flag) */ static void idct_8x32_c(const coeff_t *src, coeff_t *dst, int i_dst) { #define BSIZE_H 8 #define BSIZE_V 32 ALIGN32(coeff_t coeff[BSIZE_H * BSIZE_V]); ALIGN32(coeff_t block[BSIZE_H * BSIZE_V]); int shift1 = 5; int shift2 = 20 - g_bit_depth - (i_dst & 0x01); int clip_depth1 = LIMIT_BIT; int clip_depth2 = g_bit_depth + 1 + (i_dst & 0x01); int i; partialButterflyInverse32_c(src, coeff, shift1, BSIZE_H, clip_depth1); partialButterflyInverse8_c (coeff, block, shift2, BSIZE_V, clip_depth2); i_dst &= 0xFE; for (i = 0; i < BSIZE_V; i++) { memcpy(&dst[i * i_dst], &block[i * BSIZE_H], BSIZE_H * sizeof(coeff_t)); } #undef BSIZE_H #undef BSIZE_V } /* --------------------------------------------------------------------------- */ static void idct_64x16_c(const coeff_t *src, coeff_t *dst, int i_dst) { ALIGN32(coeff_t row_buf[64 + LOT_MAX_WLT_TAP * 2]); coeff_t *pExt = row_buf + LOT_MAX_WLT_TAP; const int N0 = 64; const int N1 = 16; int x, y, offset; /* step 0: idct 32x32 transform */ idct_32x8_c(src, dst, i_dst | 1); /* step 1: vertical transform */ for (x = 0; x < (N0 >> 1); x++) { /* copy */ for (y = 0, offset = 0; y < N1 >> 1; y++, 
offset += (N0 >> 1)) { pExt[y << 1] = dst[x + offset]; } /* reflection */ pExt[N1] = pExt[N1 - 2]; /* filtering (even pixel) */ for (y = 0; y <= N1; y += 2) { pExt[y] >>= 1; } /* filtering (odd pixel) */ for (y = 1; y < N1; y += 2) { pExt[y] = (pExt[y - 1] + pExt[y + 1]) >> 1; } /* copy */ for (y = 0, offset = 0; y < N1; y++, offset += N0) { dst[x + offset] = pExt[y]; } } /* step 2: horizontal transform */ for (y = 0, offset = 0; y < N1; y++, offset += N0) { /* copy */ for (x = 0; x < N0 >> 1; x++) { pExt[x << 1] = dst[offset + x]; } /* reflection */ pExt[N0] = pExt[N0 - 2]; /* filtering (odd pixel) */ for (x = 1; x < N0; x += 2) { pExt[x] = (pExt[x - 1] + pExt[x + 1]) >> 1; } /* copy */ memcpy(dst + offset, pExt, N0 * sizeof(coeff_t)); } } /* --------------------------------------------------------------------------- */ static void idct_16x64_c(const coeff_t *src, coeff_t *dst, int i_dst) { ALIGN32(coeff_t row_buf[64 + LOT_MAX_WLT_TAP * 2]); coeff_t *pExt = row_buf + LOT_MAX_WLT_TAP; const int N0 = 16; const int N1 = 64; int x, y, offset; /* step 0: idct 8x32 transform */ idct_8x32_c(src, dst, i_dst | 1); /* step 1: vertical transform */ for (x = 0; x < (N0 >> 1); x++) { /* copy */ for (y = 0, offset = 0; y < N1 >> 1; y++, offset += (N0 >> 1)) { pExt[y << 1] = dst[x + offset]; } /* reflection */ pExt[N1] = pExt[N1 - 2]; /* filtering (even pixel) */ for (y = 0; y <= N1; y += 2) { pExt[y] >>= 1; } /* filtering (odd pixel) */ for (y = 1; y < N1; y += 2) { pExt[y] = (pExt[y - 1] + pExt[y + 1]) >> 1; } /* copy */ for (y = 0, offset = 0; y < N1; y++, offset += N0) { dst[x + offset] = pExt[y]; } } /* step 2: horizontal transform */ for (y = 0, offset = 0; y < N1; y++, offset += N0) { /* copy */ for (x = 0; x < N0 >> 1; x++) { pExt[x << 1] = dst[offset + x]; } /* reflection */ pExt[N0] = pExt[N0 - 2]; /* filtering (odd pixel) */ for (x = 1; x < N0; x += 2) { pExt[x] = (pExt[x - 1] + pExt[x + 1]) >> 1; } /* copy */ memcpy(dst + offset, pExt, N0 * sizeof(coeff_t)); } } /* --------------------------------------------------------------------------- */ static void xTr2nd_4_1d_Inv_Ver(coeff_t *coeff, int i_coeff, int i_shift, const int16_t *tc) { int tmp_dct[SEC_TR_SIZE * SEC_TR_SIZE]; const int add = 1 << (i_shift - 1); int i, j, k, sum; for (i = 0; i < SEC_TR_SIZE; i++) { for (j = 0; j < SEC_TR_SIZE; j++) { tmp_dct[i * SEC_TR_SIZE + j] = coeff[i * i_coeff + j]; } } for (i = 0; i < SEC_TR_SIZE; i++) { for (j = 0; j < SEC_TR_SIZE; j++) { sum = add; for (k = 0; k < SEC_TR_SIZE; k++) { sum += tc[k * SEC_TR_SIZE + i] * tmp_dct[k * SEC_TR_SIZE + j]; } coeff[i * i_coeff + j] = (coeff_t)DAVS2_CLIP3(-32768, 32767, sum >> i_shift); } } } /* --------------------------------------------------------------------------- */ static void xTr2nd_4_1d_Inv_Hor(coeff_t *coeff, int i_coeff, int i_shift, int clip_depth, const int16_t *tc) { int tmp_dct[SEC_TR_SIZE * SEC_TR_SIZE]; const int max_val = (1 << (clip_depth - 1)) - 1; const int min_val = -max_val - 1; const int add = 1 << (i_shift - 1); int i, j, k, sum; for (i = 0; i < SEC_TR_SIZE; i++) { for (j = 0; j < SEC_TR_SIZE; j++) { tmp_dct[i * SEC_TR_SIZE + j] = coeff[i * i_coeff + j]; } } for (i = 0; i < SEC_TR_SIZE; i++) { for (j = 0; j < SEC_TR_SIZE; j++) { sum = add; for (k = 0; k < SEC_TR_SIZE; k++) { sum += tc[k * SEC_TR_SIZE + i] * tmp_dct[j * SEC_TR_SIZE + k]; } coeff[j * i_coeff + i] = (coeff_t)DAVS2_CLIP3(min_val, max_val, sum >> i_shift); } } } /* --------------------------------------------------------------------------- */ static void 
inv_transform_4x4_2nd_c(coeff_t *coeff, int i_coeff) { const int shift1 = 5; const int shift2 = 20 - g_bit_depth + 2; const int clip_depth2 = g_bit_depth + 1; xTr2nd_4_1d_Inv_Ver(coeff, i_coeff, shift1, g_2T_C); xTr2nd_4_1d_Inv_Hor(coeff, i_coeff, shift2, clip_depth2, g_2T_C); } /* --------------------------------------------------------------------------- * i_mode - real intra mode (luma) * b_top - block top available? * b_left - block left available? */ static void inv_transform_2nd_c(coeff_t *coeff, int i_coeff, int i_mode, int b_top, int b_left) { int vt = (i_mode >= 0 && i_mode <= 23); int ht = (i_mode >= 13 && i_mode <= 32) || (i_mode >= 0 && i_mode <= 2); if (ht && b_left) { xTr2nd_4_1d_Inv_Hor(coeff, i_coeff, 7, 16, g_2T); } if (vt && b_top) { xTr2nd_4_1d_Inv_Ver(coeff, i_coeff, 7, g_2T); } } /* --------------------------------------------------------------------------- */ static INLINE void inv_transform(davs2_row_rec_t *row_rec, coeff_t *p_coeff, cu_t *p_cu, int i_coeff, int bsx, int bsy, int b_secT, int blockidx, int i_luma_intra_mode) { int part_idx = PART_INDEX(bsx, bsy); dct_t idct = gf_davs2.idct[part_idx][p_cu->dct_pattern[blockidx]]; b_secT = b_secT && IS_INTRA(p_cu) && blockidx < 4; if (part_idx == PART_4x4) { if (b_secT) { gf_davs2.inv_transform_4x4_2nd(p_coeff, i_coeff); } else { idct(p_coeff, p_coeff, i_coeff); } } else { if (b_secT) { gf_davs2.inv_transform_2nd(p_coeff, i_coeff, i_luma_intra_mode, row_rec->b_block_avail_top, row_rec->b_block_avail_left); } idct(p_coeff, p_coeff, i_coeff); } } /* --------------------------------------------------------------------------- * copy region of h->lcu.residual[] corresponding to blockidx to p_dst */ static ALWAYS_INLINE coeff_t *get_quanted_coeffs(davs2_row_rec_t *row_rec, cu_t *p_cu, int blockidx) { int idx_cu_zscan = row_rec->idx_cu_zscan; coeff_t *p_res; if (blockidx < 4) { int block_offset = blockidx << ((p_cu->i_cu_level - 1) << 1); p_res = &row_rec->p_rec_info->coeff_buf_y[idx_cu_zscan << 6]; p_res += block_offset; } else { p_res = &row_rec->p_rec_info->coeff_buf_uv[blockidx - 4][idx_cu_zscan << 4]; } return p_res; } /* --------------------------------------------------------------------------- * get reconstruction pixels for blocks (include luma and chroma component) */ void davs2_get_recons(davs2_row_rec_t *row_rec, cu_t *p_cu, int blockidx, cb_t *p_tu, int ctu_x, int ctu_y) { int bsx = p_tu->w; int bsy = p_tu->h; int x_start = p_tu->x; int y_start = p_tu->y; int b_luma = blockidx < 4; int b_wavelet_conducted = (b_luma && p_cu->i_cu_level == B64X64_IN_BIT && p_cu->i_trans_size != TU_SPLIT_CROSS); coeff_t *p_coeff; pel_t *p_dst; int i_coeff; int i_dst; davs2_t *h = row_rec->h; assert(((p_cu->i_cbp >> blockidx) & 1) != 0); // inverse transform p_tu->v >>= b_wavelet_conducted; i_coeff = p_tu->w; p_coeff = get_quanted_coeffs(row_rec, p_cu, blockidx); inv_transform(row_rec, p_coeff, p_cu, i_coeff, bsx, bsy, h->seq_info.enable_2nd_transform, blockidx, p_cu->intra_pred_modes[blockidx]); i_coeff <<= b_wavelet_conducted; if (b_luma) { x_start += ctu_x; y_start += ctu_y; i_dst = row_rec->ctu.i_fdec[0]; p_dst = row_rec->ctu.p_fdec[0] + y_start * i_dst + x_start; } else { x_start = (ctu_x >> 1); y_start = (ctu_y >> 1); i_dst = row_rec->ctu.i_fdec[blockidx - 3]; p_dst = row_rec->ctu.p_fdec[blockidx - 3] + y_start * i_dst + x_start; } // normalize gf_davs2.add_ps[PART_INDEX(bsx, bsy)](p_dst, i_dst, p_dst, p_coeff, i_dst, i_coeff); } /* --------------------------------------------------------------------------- */ void 
/* --------------------------------------------------------------------------- */
void davs2_dct_init(uint32_t cpuid, ao_funcs_t *fh)
{
    int i;
    UNUSED_PARAMETER(cpuid);

    /* init c function handles */
    fh->inv_transform_4x4_2nd = inv_transform_4x4_2nd_c;
    fh->inv_transform_2nd     = inv_transform_2nd_c;
    for (i = 0; i < DCT_PATTERN_NUM; i++) {
        fh->idct[PART_4x4  ][i] = idct_4x4_c;
        fh->idct[PART_8x8  ][i] = idct_8x8_c;
        fh->idct[PART_16x16][i] = idct_16x16_c;
        fh->idct[PART_32x32][i] = idct_32x32_c;
        fh->idct[PART_64x64][i] = idct_64x64_c;
        fh->idct[PART_4x16 ][i] = idct_4x16_c;
        fh->idct[PART_8x32 ][i] = idct_8x32_c;
        fh->idct[PART_16x4 ][i] = idct_16x4_c;
        fh->idct[PART_32x8 ][i] = idct_32x8_c;
        fh->idct[PART_64x16][i] = idct_64x16_c;
        fh->idct[PART_16x64][i] = idct_16x64_c;
    }

    /* init asm function handles */
#if HAVE_MMX
    /* functions defined in file intrinsic_dct.c */
    if (cpuid & DAVS2_CPU_SSE2) {
        fh->inv_transform_4x4_2nd = inv_transform_4x4_2nd_sse128;
        fh->inv_transform_2nd     = inv_transform_2nd_sse128;
        for (i = 0; i < DCT_PATTERN_NUM; i++) {
            fh->idct[PART_4x4  ][i] = idct_4x4_sse128;
            fh->idct[PART_8x8  ][i] = idct_8x8_sse128;
            fh->idct[PART_16x16][i] = idct_16x16_sse128;
            fh->idct[PART_32x32][i] = idct_32x32_sse128;
            fh->idct[PART_64x64][i] = idct_64x64_sse128;
            fh->idct[PART_64x16][i] = idct_64x16_sse128;
            fh->idct[PART_16x64][i] = idct_16x64_sse128;
            fh->idct[PART_4x16 ][i] = idct_4x16_sse128;
            fh->idct[PART_8x32 ][i] = idct_8x32_sse128;
            fh->idct[PART_16x4 ][i] = idct_16x4_sse128;
            fh->idct[PART_32x8 ][i] = idct_32x8_sse128;
#if !HIGH_BIT_DEPTH
            fh->idct[PART_4x4  ][i] = FPFX(idct_4x4_sse2);
#if ARCH_X86_64
            fh->idct[PART_8x8  ][i] = FPFX(idct_8x8_sse2);
#endif
#endif
        }
    }

    if (cpuid & DAVS2_CPU_SSSE3) {
        for (i = 0; i < DCT_PATTERN_NUM; i++) {
#if HIGH_BIT_DEPTH
            // 10bit assembly
#else
            fh->idct[PART_8x8 ][i] = davs2_idct_8x8_ssse3;
#endif
        }
    }

    /* TODO: init default DCT pattern handles */
    if (cpuid & DAVS2_CPU_SSE2) {
        /* square */
        fh->idct[PART_8x8  ][DCT_HALF] = idct_8x8_half_sse128;
        fh->idct[PART_8x8  ][DCT_QUAD] = idct_8x8_quad_sse128;
        fh->idct[PART_16x16][DCT_HALF] = idct_16x16_half_sse128;
        fh->idct[PART_16x16][DCT_QUAD] = idct_16x16_quad_sse128;
        fh->idct[PART_32x32][DCT_HALF] = idct_32x32_half_sse128;
        fh->idct[PART_32x32][DCT_QUAD] = idct_32x32_quad_sse128;
        fh->idct[PART_64x64][DCT_HALF] = idct_64x64_half_sse128;
        fh->idct[PART_64x64][DCT_QUAD] = idct_64x64_quad_sse128;
        /* non-square */
        fh->idct[PART_4x16 ][DCT_HALF] = idct_4x16_half_sse128;
        fh->idct[PART_4x16 ][DCT_QUAD] = idct_4x16_quad_sse128;
        fh->idct[PART_16x4 ][DCT_HALF] = idct_16x4_half_sse128;
        fh->idct[PART_16x4 ][DCT_QUAD] = idct_16x4_quad_sse128;
        fh->idct[PART_8x32 ][DCT_QUAD] = idct_8x32_quad_sse128;
        fh->idct[PART_8x32 ][DCT_HALF] = idct_8x32_half_sse128;
        fh->idct[PART_32x8 ][DCT_HALF] = idct_32x8_half_sse128;
        fh->idct[PART_32x8 ][DCT_QUAD] = idct_32x8_quad_sse128;
        fh->idct[PART_16x64][DCT_HALF] = idct_16x64_half_sse128;
        fh->idct[PART_16x64][DCT_QUAD] = idct_16x64_quad_sse128;
        fh->idct[PART_64x16][DCT_HALF] = idct_64x16_half_sse128;
        fh->idct[PART_64x16][DCT_QUAD] = idct_64x16_quad_sse128;
    }

#if ARCH_X86_64
    if (cpuid & DAVS2_CPU_AVX2) {
        fh->idct[PART_8x8  ][DCT_DEAULT] = idct_8x8_avx2;
        fh->idct[PART_16x16][DCT_DEAULT] = idct_16x16_avx2;
        fh->idct[PART_64x64][DCT_DEAULT] = idct_64x64_avx2;
        fh->idct[PART_64x16][DCT_DEAULT] = idct_64x16_avx2;
        fh->idct[PART_16x64][DCT_DEAULT] = idct_16x64_avx2;
        fh->idct[PART_32x32][DCT_DEAULT] = idct_32x32_avx2;

        // @luofl: on i7-6700k these run slower than the sse128 versions
        /* square */
        // fh->idct[PART_8x8  ][DCT_HALF] = idct_8x8_half_avx2;
        // fh->idct[PART_8x8  ][DCT_QUAD] = idct_8x8_quad_avx2;
        // fh->idct[PART_16x16][DCT_HALF] = idct_16x16_half_avx2;
        // fh->idct[PART_16x16][DCT_QUAD] = idct_16x16_quad_avx2;
        // fh->idct[PART_32x32][DCT_HALF] = idct_32x32_half_avx2;
        // fh->idct[PART_32x32][DCT_QUAD] = idct_32x32_quad_avx2;
        // fh->idct[PART_64x64][DCT_HALF] = idct_64x64_half_avx2;
        // fh->idct[PART_64x64][DCT_QUAD] = idct_64x64_quad_avx2;
        /* non-square */
        // fh->idct[PART_4x16 ][DCT_HALF] = idct_4x16_half_avx2;
        // fh->idct[PART_4x16 ][DCT_QUAD] = idct_4x16_quad_avx2;
        // fh->idct[PART_16x4 ][DCT_HALF] = idct_16x4_half_avx2;
        // fh->idct[PART_16x4 ][DCT_QUAD] = idct_16x4_quad_avx2;
        // fh->idct[PART_8x32 ][DCT_QUAD] = idct_8x32_quad_avx2;
        // fh->idct[PART_8x32 ][DCT_HALF] = idct_8x32_half_avx2;
        // fh->idct[PART_32x8 ][DCT_HALF] = idct_32x8_half_avx2;
        // fh->idct[PART_32x8 ][DCT_QUAD] = idct_32x8_quad_avx2;
        // fh->idct[PART_16x64][DCT_HALF] = idct_16x64_half_avx2;
        // fh->idct[PART_16x64][DCT_QUAD] = idct_16x64_quad_avx2;
        // fh->idct[PART_64x16][DCT_HALF] = idct_64x16_half_avx2;
        // fh->idct[PART_64x16][DCT_QUAD] = idct_64x16_quad_avx2;
    }
#endif  // if ARCH_X86_64
#endif  // if HAVE_MMX
}
davs2-1.6/source/common/transform.h000066400000000000000000000032771337322544400173710ustar00rootroot00000000000000/*
 * transform.h
 *
 * Description of this file:
 *    Transform functions definition of the davs2 library
 *
 * --------------------------------------------------------------------------
 *
 *    davs2 - video decoder of AVS2/IEEE1857.4 video coding standard
 *    Copyright (C) 2018~ VCL, NELVT, Peking University
 *
 *    Authors: Falei LUO
 *             etc.
 *
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 *    This program is also available under a commercial proprietary license.
 *    For more information, contact us at sswang @ pku.edu.cn.
 */

#ifndef DAVS2_TRANSFORM_H
#define DAVS2_TRANSFORM_H

#ifdef __cplusplus
extern "C" {
#endif

#define davs2_dct_init FPFX(dct_init)
void davs2_dct_init(uint32_t cpuid, ao_funcs_t *fh);

#define davs2_get_recons FPFX(get_recons)
void davs2_get_recons(davs2_row_rec_t *row_rec, cu_t *p_cu, int blockidx, cb_t *p_tu, int ctu_x, int ctu_y);

#ifdef __cplusplus
}
#endif

#endif  // DAVS2_TRANSFORM_H
davs2-1.6/source/common/vec/000077500000000000000000000000001337322544400157535ustar00rootroot00000000000000davs2-1.6/source/common/vec/intrinsic.cc000066400000000000000000000730661337322544400202760ustar00rootroot00000000000000/*
 * intrinsic.cc
 *
 * Description of this file:
 *    tables used in SIMD assembly functions of the davs2 library
 *
 * --------------------------------------------------------------------------
 *
 *    davs2 - video decoder of AVS2/IEEE1857.4 video coding standard
 *    Copyright (C) 2018~ VCL, NELVT, Peking University
 *
 *    Authors: Falei LUO
 *             etc.
 *
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ #include "../common.h" #include "intrinsic.h" #include #include #include #include #if HIGH_BIT_DEPTH ALIGN32(const int16_t intrinsic_mask_10bit[15][16]) = { { -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0 } }; #else ALIGN32(const int8_t intrinsic_mask[15][16]) = { { -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0 } }; #endif // #if !HIGH_BIT_DEPTH ALIGN32(const int8_t intrinsic_mask_256_8bit[16][32]) = { { -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, 
-1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }; ALIGN32(const int8_t intrinsic_mask32[32][32]) = { { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, { -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0 } }; ALIGN32(const int8_t tab_log2[65]) = { -1, 0, 1, -1, 2, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1, 4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 6 }; const uint8_t tab_idx_mode_7[64] = { 0, 1, 2, 2, 3, 4, 5, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 13, 13, 14, 15, 15, 16, 17, 18, 18, 19, 20, 21, 21, 22, 23, 23, 24, 25, 26, 26, 27, 28, 29, 29, 30, 31, 31, 32, 33, 34, 34, 35, 36, 37, 37, 38, 39, 39, 40, 41, 42, 42, 43, 44, 45, 45, 46 }; ALIGN16(const pel_t tab_coeff_mode_7[64][16]) = { { 9, 41, 55, 23, 9, 41, 55, 23, 9, 41, 55, 23, 9, 41, 55, 23 },//0 { 18, 50, 46, 14, 18, 50, 46, 14, 18, 50, 46, 14, 18, 50, 46, 14 }, { 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5 }, { 3, 35, 61, 29, 3, 35, 61, 29, 3, 35, 61, 29, 3, 35, 61, 29 }, { 12, 44, 52, 20, 12, 44, 52, 20, 12, 44, 52, 20, 12, 44, 52, 20 }, { 21, 53, 43, 11, 21, 53, 43, 11, 21, 53, 43, 11, 21, 53, 43, 11 }, { 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2 }, { 6, 38, 58, 26, 6, 38, 58, 26, 6, 38, 58, 26, 6, 38, 58, 26 }, { 15, 47, 49, 17, 15, 47, 49, 17, 15, 47, 49, 17, 15, 47, 49, 17 },//8 { 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8 }, { 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31 }, { 9, 41, 55, 23, 9, 41, 55, 23, 9, 41, 55, 23, 9, 41, 55, 23 }, { 18, 50, 46, 14, 18, 50, 46, 14, 18, 50, 46, 14, 18, 50, 46, 14 }, { 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5 }, { 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28 }, { 12, 44, 52, 20, 12, 44, 52, 20, 12, 44, 52, 20, 12, 44, 52, 20 }, { 21, 53, 43, 11, 21, 53, 43, 11, 21, 53, 43, 11, 21, 53, 43, 11 },//16 { 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2 }, { 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25 }, { 15, 47, 49, 17, 15, 47, 49, 17, 15, 47, 49, 17, 15, 47, 49, 17 }, { 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8 }, { 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31 }, { 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22 }, { 18, 50, 46, 14, 18, 50, 46, 14, 18, 50, 46, 14, 18, 50, 46, 14 }, { 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5 },//24 { 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28 }, { 13, 45, 51, 19, 13, 45, 51, 19, 13, 45, 51, 19, 13, 45, 51, 19 }, { 21, 53, 43, 11, 21, 53, 43, 11, 21, 
53, 43, 11, 21, 53, 43, 11 }, { 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2 }, { 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25 }, { 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16 }, { 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8 }, { 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31 },//32 { 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22 }, { 19, 51, 45, 13, 19, 51, 45, 13, 19, 51, 45, 13, 19, 51, 45, 13 }, { 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5 }, { 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28 }, { 13, 45, 51, 19, 13, 45, 51, 19, 13, 45, 51, 19, 13, 45, 51, 19 }, { 22, 54, 42, 10, 22, 54, 42, 10, 22, 54, 42, 10, 22, 54, 42, 10 }, { 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2 }, { 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25 },//40 { 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16 }, { 25, 57, 39, 7, 25, 57, 39, 7, 25, 57, 39, 7, 25, 57, 39, 7 }, { 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31 }, { 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22 }, { 19, 51, 45, 13, 19, 51, 45, 13, 19, 51, 45, 13, 19, 51, 45, 13 }, { 28, 60, 36, 4, 28, 60, 36, 4, 28, 60, 36, 4, 28, 60, 36, 4 }, { 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28 }, { 13, 45, 51, 19, 13, 45, 51, 19, 13, 45, 51, 19, 13, 45, 51, 19 },//48 { 22, 54, 42, 10, 22, 54, 42, 10, 22, 54, 42, 10, 22, 54, 42, 10 }, { 31, 63, 33, 1, 31, 63, 33, 1, 31, 63, 33, 1, 31, 63, 33, 1 }, { 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25 }, { 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16 }, { 25, 57, 39, 7, 25, 57, 39, 7, 25, 57, 39, 7, 25, 57, 39, 7 }, { 2, 34, 62, 30, 2, 34, 62, 30, 2, 34, 62, 30, 2, 34, 62, 30 }, { 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22 }, { 19, 51, 45, 13, 19, 51, 45, 13, 19, 51, 45, 13, 19, 51, 45, 13 },//56 { 28, 60, 36, 4, 28, 60, 36, 4, 28, 60, 36, 4, 28, 60, 36, 4 }, { 5, 37, 59, 27, 5, 37, 59, 27, 5, 37, 59, 27, 5, 37, 59, 27 }, { 13, 45, 51, 19, 13, 45, 51, 19, 13, 45, 51, 19, 13, 45, 51, 19 }, { 22, 54, 42, 10, 22, 54, 42, 10, 22, 54, 42, 10, 22, 54, 42, 10 }, { 31, 63, 33, 1, 31, 63, 33, 1, 31, 63, 33, 1, 31, 63, 33, 1 }, { 8, 40, 56, 24, 8, 40, 56, 24, 8, 40, 56, 24, 8, 40, 56, 24 }, { 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16 }//63 }; ALIGN32(const pel_t tab_coeff_mode_7_avx[64][32]) = { { 9, 41, 55, 23, 9, 41, 55, 23, 9, 41, 55, 23, 9, 41, 55, 23, 9, 41, 55, 23, 9, 41, 55, 23, 9, 41, 55, 23, 9, 41, 55, 23},//0 { 18, 50, 46, 14, 18, 50, 46, 14, 18, 50, 46, 14, 18, 50, 46, 14, 18, 50, 46, 14, 18, 50, 46, 14, 18, 50, 46, 14, 18, 50, 46, 14}, { 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5}, { 3, 35, 61, 29, 3, 35, 61, 29, 3, 35, 61, 29, 3, 35, 61, 29, 3, 35, 61, 29, 3, 35, 61, 29, 3, 35, 61, 29, 3, 35, 61, 29}, { 12, 44, 52, 20, 12, 44, 52, 20, 12, 44, 52, 20, 12, 44, 52, 20, 12, 44, 52, 20, 12, 44, 52, 20, 12, 44, 52, 20, 12, 44, 52, 20}, { 21, 53, 43, 11, 21, 53, 43, 11, 21, 53, 43, 11, 21, 53, 43, 11, 21, 53, 43, 11, 21, 53, 43, 11, 21, 53, 43, 11, 21, 53, 43, 11}, { 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2}, { 6, 38, 58, 26, 6, 38, 58, 26, 6, 38, 58, 26, 6, 38, 58, 26, 6, 38, 58, 26, 6, 38, 58, 26, 6, 38, 58, 26, 6, 38, 58, 26}, { 15, 47, 49, 17, 15, 47, 49, 17, 15, 47, 49, 17, 15, 47, 49, 17, 15, 
47, 49, 17, 15, 47, 49, 17, 15, 47, 49, 17, 15, 47, 49, 17},//8 { 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8}, { 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31}, { 9, 41, 55, 23, 9, 41, 55, 23, 9, 41, 55, 23, 9, 41, 55, 23, 9, 41, 55, 23, 9, 41, 55, 23, 9, 41, 55, 23, 9, 41, 55, 23}, { 18, 50, 46, 14, 18, 50, 46, 14, 18, 50, 46, 14, 18, 50, 46, 14, 18, 50, 46, 14, 18, 50, 46, 14, 18, 50, 46, 14, 18, 50, 46, 14}, { 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5}, { 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28}, { 12, 44, 52, 20, 12, 44, 52, 20, 12, 44, 52, 20, 12, 44, 52, 20, 12, 44, 52, 20, 12, 44, 52, 20, 12, 44, 52, 20, 12, 44, 52, 20}, { 21, 53, 43, 11, 21, 53, 43, 11, 21, 53, 43, 11, 21, 53, 43, 11, 21, 53, 43, 11, 21, 53, 43, 11, 21, 53, 43, 11, 21, 53, 43, 11},//16 { 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2}, { 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25}, { 15, 47, 49, 17, 15, 47, 49, 17, 15, 47, 49, 17, 15, 47, 49, 17, 15, 47, 49, 17, 15, 47, 49, 17, 15, 47, 49, 17, 15, 47, 49, 17}, { 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8}, { 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31}, { 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22}, { 18, 50, 46, 14, 18, 50, 46, 14, 18, 50, 46, 14, 18, 50, 46, 14, 18, 50, 46, 14, 18, 50, 46, 14, 18, 50, 46, 14, 18, 50, 46, 14}, { 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5},//24 { 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28}, { 13, 45, 51, 19, 13, 45, 51, 19, 13, 45, 51, 19, 13, 45, 51, 19, 13, 45, 51, 19, 13, 45, 51, 19, 13, 45, 51, 19, 13, 45, 51, 19}, { 21, 53, 43, 11, 21, 53, 43, 11, 21, 53, 43, 11, 21, 53, 43, 11, 21, 53, 43, 11, 21, 53, 43, 11, 21, 53, 43, 11, 21, 53, 43, 11}, { 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2}, { 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25}, { 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16}, { 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8}, { 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31},//32 { 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22}, { 19, 51, 45, 13, 19, 51, 45, 13, 19, 51, 45, 13, 19, 51, 45, 13, 19, 51, 45, 13, 19, 51, 45, 13, 19, 51, 45, 13, 19, 51, 45, 13}, { 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5}, { 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 
60, 28, 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28}, { 13, 45, 51, 19, 13, 45, 51, 19, 13, 45, 51, 19, 13, 45, 51, 19, 13, 45, 51, 19, 13, 45, 51, 19, 13, 45, 51, 19, 13, 45, 51, 19}, { 22, 54, 42, 10, 22, 54, 42, 10, 22, 54, 42, 10, 22, 54, 42, 10, 22, 54, 42, 10, 22, 54, 42, 10, 22, 54, 42, 10, 22, 54, 42, 10}, { 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2}, { 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25},//40 { 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16}, { 25, 57, 39, 7, 25, 57, 39, 7, 25, 57, 39, 7, 25, 57, 39, 7, 25, 57, 39, 7, 25, 57, 39, 7, 25, 57, 39, 7, 25, 57, 39, 7}, { 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31}, { 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22}, { 19, 51, 45, 13, 19, 51, 45, 13, 19, 51, 45, 13, 19, 51, 45, 13, 19, 51, 45, 13, 19, 51, 45, 13, 19, 51, 45, 13, 19, 51, 45, 13}, { 28, 60, 36, 4, 28, 60, 36, 4, 28, 60, 36, 4, 28, 60, 36, 4, 28, 60, 36, 4, 28, 60, 36, 4, 28, 60, 36, 4, 28, 60, 36, 4}, { 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28}, { 13, 45, 51, 19, 13, 45, 51, 19, 13, 45, 51, 19, 13, 45, 51, 19, 13, 45, 51, 19, 13, 45, 51, 19, 13, 45, 51, 19, 13, 45, 51, 19},//48 { 22, 54, 42, 10, 22, 54, 42, 10, 22, 54, 42, 10, 22, 54, 42, 10, 22, 54, 42, 10, 22, 54, 42, 10, 22, 54, 42, 10, 22, 54, 42, 10}, { 31, 63, 33, 1, 31, 63, 33, 1, 31, 63, 33, 1, 31, 63, 33, 1, 31, 63, 33, 1, 31, 63, 33, 1, 31, 63, 33, 1, 31, 63, 33, 1}, { 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25}, { 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16}, { 25, 57, 39, 7, 25, 57, 39, 7, 25, 57, 39, 7, 25, 57, 39, 7, 25, 57, 39, 7, 25, 57, 39, 7, 25, 57, 39, 7, 25, 57, 39, 7}, { 2, 34, 62, 30, 2, 34, 62, 30, 2, 34, 62, 30, 2, 34, 62, 30, 2, 34, 62, 30, 2, 34, 62, 30, 2, 34, 62, 30, 2, 34, 62, 30}, { 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22}, { 19, 51, 45, 13, 19, 51, 45, 13, 19, 51, 45, 13, 19, 51, 45, 13, 19, 51, 45, 13, 19, 51, 45, 13, 19, 51, 45, 13, 19, 51, 45, 13},//56 { 28, 60, 36, 4, 28, 60, 36, 4, 28, 60, 36, 4, 28, 60, 36, 4, 28, 60, 36, 4, 28, 60, 36, 4, 28, 60, 36, 4, 28, 60, 36, 4}, { 5, 37, 59, 27, 5, 37, 59, 27, 5, 37, 59, 27, 5, 37, 59, 27, 5, 37, 59, 27, 5, 37, 59, 27, 5, 37, 59, 27, 5, 37, 59, 27}, { 13, 45, 51, 19, 13, 45, 51, 19, 13, 45, 51, 19, 13, 45, 51, 19, 13, 45, 51, 19, 13, 45, 51, 19, 13, 45, 51, 19, 13, 45, 51, 19}, { 22, 54, 42, 10, 22, 54, 42, 10, 22, 54, 42, 10, 22, 54, 42, 10, 22, 54, 42, 10, 22, 54, 42, 10, 22, 54, 42, 10, 22, 54, 42, 10}, { 31, 63, 33, 1, 31, 63, 33, 1, 31, 63, 33, 1, 31, 63, 33, 1, 31, 63, 33, 1, 31, 63, 33, 1, 31, 63, 33, 1, 31, 63, 33, 1}, { 8, 40, 56, 24, 8, 40, 56, 24, 8, 40, 56, 24, 8, 40, 56, 24, 8, 40, 56, 24, 8, 40, 56, 24, 8, 40, 56, 24, 8, 40, 56, 24}, { 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16}//63 }; #if HIGH_BIT_DEPTH ALIGN16(const int16_t 
tab_coeff_mode_9[64][16]) = { #else ALIGN16(const int8_t tab_coeff_mode_9[64][16]) = { #endif { 21, 53, 43, 11, 21, 53, 43, 11, 21, 53, 43, 11, 21, 53, 43, 11 }, { 9, 41, 55, 23, 9, 41, 55, 23, 9, 41, 55, 23, 9, 41, 55, 23 }, { 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2 }, { 18, 50, 46, 14, 18, 50, 46, 14, 18, 50, 46, 14, 18, 50, 46, 14 }, { 6, 38, 58, 26, 6, 38, 58, 26, 6, 38, 58, 26, 6, 38, 58, 26 }, { 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5 }, { 15, 47, 49, 17, 15, 47, 49, 17, 15, 47, 49, 17, 15, 47, 49, 17 }, { 3, 35, 61, 29, 3, 35, 61, 29, 3, 35, 61, 29, 3, 35, 61, 29 }, { 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8 }, { 12, 44, 52, 20, 12, 44, 52, 20, 12, 44, 52, 20, 12, 44, 52, 20 }, { 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31 }, { 21, 53, 43, 11, 21, 53, 43, 11, 21, 53, 43, 11, 21, 53, 43, 11 }, { 9, 41, 55, 23, 9, 41, 55, 23, 9, 41, 55, 23, 9, 41, 55, 23 }, { 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2 }, { 18, 50, 46, 14, 18, 50, 46, 14, 18, 50, 46, 14, 18, 50, 46, 14 }, { 6, 38, 58, 26, 6, 38, 58, 26, 6, 38, 58, 26, 6, 38, 58, 26 }, { 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5 }, { 15, 47, 49, 17, 15, 47, 49, 17, 15, 47, 49, 17, 15, 47, 49, 17 }, { 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28 }, { 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8 }, { 12, 44, 52, 20, 12, 44, 52, 20, 12, 44, 52, 20, 12, 44, 52, 20 }, { 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31 }, { 21, 53, 43, 11, 21, 53, 43, 11, 21, 53, 43, 11, 21, 53, 43, 11 }, { 9, 41, 55, 23, 9, 41, 55, 23, 9, 41, 55, 23, 9, 41, 55, 23 }, { 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2 }, { 18, 50, 46, 14, 18, 50, 46, 14, 18, 50, 46, 14, 18, 50, 46, 14 }, { 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25 }, { 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5 }, { 15, 47, 49, 17, 15, 47, 49, 17, 15, 47, 49, 17, 15, 47, 49, 17 }, { 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28 }, { 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8 }, { 12, 44, 52, 20, 12, 44, 52, 20, 12, 44, 52, 20, 12, 44, 52, 20 }, { 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31 }, { 21, 53, 43, 11, 21, 53, 43, 11, 21, 53, 43, 11, 21, 53, 43, 11 }, { 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22 }, { 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2 }, { 18, 50, 46, 14, 18, 50, 46, 14, 18, 50, 46, 14, 18, 50, 46, 14 }, { 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25 }, { 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5 }, { 15, 47, 49, 17, 15, 47, 49, 17, 15, 47, 49, 17, 15, 47, 49, 17 }, { 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28 }, { 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8 }, { 13, 45, 51, 19, 13, 45, 51, 19, 13, 45, 51, 19, 13, 45, 51, 19 }, { 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31 }, { 21, 53, 43, 11, 21, 53, 43, 11, 21, 53, 43, 11, 21, 53, 43, 11 }, { 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22 }, { 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2 }, { 18, 50, 46, 14, 18, 50, 46, 14, 18, 50, 46, 14, 18, 50, 46, 14 }, { 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25 }, { 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5 }, { 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16 }, { 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28 }, { 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8, 
24, 56, 40, 8 }, { 13, 45, 51, 19, 13, 45, 51, 19, 13, 45, 51, 19, 13, 45, 51, 19 }, { 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31, 1, 33, 63, 31 }, { 21, 53, 43, 11, 21, 53, 43, 11, 21, 53, 43, 11, 21, 53, 43, 11 }, { 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22, 10, 42, 54, 22 }, { 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2, 30, 62, 34, 2 }, { 19, 51, 45, 13, 19, 51, 45, 13, 19, 51, 45, 13, 19, 51, 45, 13 }, { 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25, 7, 39, 57, 25 }, { 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5, 27, 59, 37, 5 }, { 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16 }, { 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28 }, { 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8 } }; const uint8_t tab_idx_mode_9[64] = { 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 9, 9, 9, 10, 10, 10, 11, 11, 11, 12, 12, 13, 13, 13, 14, 14, 14, 15, 15, 15, 16, 16, 17, 17, 17, 18, 18, 18, 19, 19, 19, 20, 20, 21, 21, 21, 22, 22, 22, 23 }; #if HIGH_BIT_DEPTH const ALIGN16(int16_t tab_coeff_mode_11[64][16]) = { #else const ALIGN16(int8_t tab_coeff_mode_11[64][16]) = { #endif { 28, 60, 36, 4, 28, 60, 36, 4, 28, 60, 36, 4, 28, 60, 36, 4 }, { 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8, 24, 56, 40, 8 }, { 20, 52, 44, 12, 20, 52, 44, 12, 20, 52, 44, 12, 20, 52, 44, 12 }, { 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16 }, { 12, 44, 52, 20, 12, 44, 52, 20, 12, 44, 52, 20, 12, 44, 52, 20 }, { 8, 40, 56, 24, 8, 40, 56, 24, 8, 40, 56, 24, 8, 40, 56, 24 }, { 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28, 4, 36, 60, 28 }, { 32, 64, 32, 0, 32, 64, 32, 0, 32, 64, 32, 0, 32, 64, 32, 0 } }; davs2-1.6/source/common/vec/intrinsic.h000066400000000000000000001135771337322544400201440ustar00rootroot00000000000000/* * intrinsic.h * * Description of this file: * SIMD assembly functions definition of the davs2 library * * -------------------------------------------------------------------------- * * davs2 - video decoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. 
 */

#ifndef DAVS2_INTRINSIC_H
#define DAVS2_INTRINSIC_H

#ifdef __cplusplus
extern "C" {
#endif

#if !defined(_MSC_VER) && !defined(__INTEL_COMPILER)
#define __int64 long long
#endif

/* ---------------------------------------------------------------------------
 * global variables
 */
#define intrinsic_mask FPFX(intrinsic_mask)
ALIGN32(extern const int8_t intrinsic_mask[15][16]);
#define intrinsic_mask_256_8bit FPFX(intrinsic_mask_256_8bit)
ALIGN32(extern const int8_t intrinsic_mask_256_8bit[16][32]);
#define intrinsic_mask32 FPFX(intrinsic_mask32)
ALIGN32(extern const int8_t intrinsic_mask32[32][32]);
#define intrinsic_mask_10bit FPFX(intrinsic_mask_10bit)
ALIGN32(extern const int16_t intrinsic_mask_10bit[15][16]);
#define tab_log2 FPFX(tab_log2)
ALIGN32(extern const int8_t tab_log2[65]);
#define tab_coeff_mode_7 FPFX(tab_coeff_mode_7)
ALIGN16(extern const pel_t tab_coeff_mode_7[64][16]);
#define tab_idx_mode_7 FPFX(tab_idx_mode_7)
ALIGN32(extern const uint8_t tab_idx_mode_7[64]);
#define tab_coeff_mode_7_avx FPFX(tab_coeff_mode_7_avx)
ALIGN32(extern const pel_t tab_coeff_mode_7_avx[64][32]);
#if HIGH_BIT_DEPTH
#define tab_coeff_mode_9 FPFX(tab_coeff_mode_9)
ALIGN16(extern const int16_t tab_coeff_mode_9[64][16]);
#else
#define tab_coeff_mode_9 FPFX(tab_coeff_mode_9)
ALIGN16(extern const int8_t tab_coeff_mode_9[64][16]);
#endif
#define tab_idx_mode_9 FPFX(tab_idx_mode_9)
extern const uint8_t tab_idx_mode_9[64];
#if HIGH_BIT_DEPTH
#define tab_coeff_mode_11 FPFX(tab_coeff_mode_11)
ALIGN16(extern const int16_t tab_coeff_mode_11[64][16]);
#else
#define tab_coeff_mode_11 FPFX(tab_coeff_mode_11)
ALIGN16(extern const int8_t tab_coeff_mode_11[64][16]);
#endif

/* ---------------------------------------------------------------------------
 * macros used for quick access of __m128i
 */
#define M128_U64(mx, idx)  *(((uint64_t *)&mx) + idx)
#define M128_U32(mx, idx)  *(((uint32_t *)&mx) + idx)
#define M128_I32(mx, idx)  *((( int32_t *)&mx) + idx)
#define M128_U16(mx, idx)  *(((uint16_t *)&mx) + idx)
#define M128_I16(mx, idx)  *((( int16_t *)&mx) + idx)

#if _MSC_VER
// add macro definitions that immintrin.h does not provide yet  zhangjiaqi 2016-12-02
#define _mm256_extract_epi64(a, i)   (a.m256i_i64[i])
#define _mm256_extract_epi32(a, i)   (a.m256i_i32[i])
#define _mm256_extract_epi16(a, i)   (a.m256i_i16[i])
#define _mm256_extract_epi8(a, i)    (a.m256i_i8 [i])
#define _mm256_insert_epi64(a, v, i) (a.m256i_i64[i] = v)
#define _mm_extract_epi64(r, i)      r.m128i_i64[i]
#else
// supply the AVX macros missing from gcc headers
#define _mm256_set_m128i(/* __m128i */ hi, /* __m128i */ lo) \
    _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)

#define _mm256_loadu2_m128i(/* __m128i const* */ hiaddr, \
                            /* __m128i const* */ loaddr) \
    _mm256_set_m128i(_mm_loadu_si128(hiaddr), _mm_loadu_si128(loaddr))

#define _mm256_storeu2_m128i(/* __m128i* */ hiaddr, /* __m128i* */ loaddr, \
                             /* __m256i */ a) \
    do { \
        __m256i _a = (a); /* reference a only once in macro body */ \
        _mm_storeu_si128((loaddr), _mm256_castsi256_si128(_a)); \
        _mm_storeu_si128((hiaddr), _mm256_extractf128_si256(_a, 0x1)); \
    } while (0)
#endif

#define davs2_memzero_aligned_c_sse2 FPFX(memzero_aligned_c_sse2)
void *davs2_memzero_aligned_c_sse2(void *dst, size_t n);
#define davs2_memzero_aligned_c_avx FPFX(memzero_aligned_c_avx)
void *davs2_memzero_aligned_c_avx (void *dst, size_t n);
#define davs2_memcpy_aligned_c_sse2 FPFX(memcpy_aligned_c_sse2)
void *davs2_memcpy_aligned_c_sse2 (void *dst, const void *src, size_t n);
#define davs2_memcpy_aligned_mmx FPFX(memcpy_aligned_mmx)
void *davs2_memcpy_aligned_mmx(void *dst, const void *src, size_t n);
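/* ---------------------------------------------------------------------------
 * Illustrative note: every symbol exported from this header is routed
 * through the FPFX() name-prefixing macro, so sources keep the short name
 * while object files carry the library-prefixed one.  A minimal sketch of
 * the idiom with a hypothetical function (the actual FPFX() expansion is
 * defined in the common headers, not here):
 *
 *     #define my_kernel FPFX(my_kernel)        // linker sees prefixed name
 *     void my_kernel(pel_t *dst, int i_dst);   // callers still write my_kernel()
 * --------------------------------------------------------------------------- */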
#define davs2_memcpy_aligned_sse FPFX(memcpy_aligned_sse) void *davs2_memcpy_aligned_sse(void *dst, const void *src, size_t n); #define davs2_fast_memcpy_mmx FPFX(fast_memcpy_mmx) void *davs2_fast_memcpy_mmx(void *dst, const void *src, size_t n); #define davs2_fast_memset_mmx FPFX(fast_memset_mmx) void *davs2_fast_memset_mmx(void *dst, int val, size_t n); #define davs2_memzero_aligned_mmx FPFX(memzero_aligned_mmx) void *davs2_memzero_aligned_mmx (void *dst, size_t n); #define davs2_memzero_aligned_sse FPFX(memzero_aligned_sse) void *davs2_memzero_aligned_sse (void *dst, size_t n); #define davs2_memzero_aligned_avx FPFX(memzero_aligned_avx) void *davs2_memzero_aligned_avx (void *dst, size_t n); #define davs2_fast_memzero_mmx FPFX(fast_memzero_mmx) void *davs2_fast_memzero_mmx (void *dst, size_t n); #define plane_copy_c_sse2 FPFX(plane_copy_c_sse2) void plane_copy_c_sse2 (pel_t *dst, intptr_t i_dst, pel_t *src, intptr_t i_src, int w, int h); #define intpl_copy_block_sse128 FPFX(intpl_copy_block_sse128) void intpl_copy_block_sse128 (pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height); #define intpl_luma_block_hor_sse128 FPFX(intpl_luma_block_hor_sse128) void intpl_luma_block_hor_sse128(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff); #define intpl_luma_block_ver_sse128 FPFX(intpl_luma_block_ver_sse128) void intpl_luma_block_ver_sse128(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff); #define intpl_luma_block_ver0_sse128 FPFX(intpl_luma_block_ver0_sse128) void intpl_luma_block_ver0_sse128(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff); #define intpl_luma_block_ver1_sse128 FPFX(intpl_luma_block_ver1_sse128) void intpl_luma_block_ver1_sse128(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff); #define intpl_luma_block_ver2_sse128 FPFX(intpl_luma_block_ver2_sse128) void intpl_luma_block_ver2_sse128(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff); #define intpl_luma_block_ext_sse128 FPFX(intpl_luma_block_ext_sse128) void intpl_luma_block_ext_sse128(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff_h, const int8_t *coeff_v); #define intpl_chroma_block_hor_sse128 FPFX(intpl_chroma_block_hor_sse128) void intpl_chroma_block_hor_sse128(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff); #define intpl_chroma_block_ver_sse128 FPFX(intpl_chroma_block_ver_sse128) void intpl_chroma_block_ver_sse128(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff); #define intpl_chroma_block_ext_sse128 FPFX(intpl_chroma_block_ext_sse128) void intpl_chroma_block_ext_sse128(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff_h, const int8_t *coeff_v); #define intpl_luma_block_hor_avx2 FPFX(intpl_luma_block_hor_avx2) void intpl_luma_block_hor_avx2(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff); #define intpl_luma_block_ver_avx2 FPFX(intpl_luma_block_ver_avx2) void intpl_luma_block_ver_avx2(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff); #define intpl_luma_block_ver0_avx2 FPFX(intpl_luma_block_ver0_avx2) void intpl_luma_block_ver0_avx2(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff); #define intpl_luma_block_ver1_avx2 
FPFX(intpl_luma_block_ver1_avx2) void intpl_luma_block_ver1_avx2(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff); #define intpl_luma_block_ver2_avx2 FPFX(intpl_luma_block_ver2_avx2) void intpl_luma_block_ver2_avx2(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff); #define intpl_luma_block_ext_avx2 FPFX(intpl_luma_block_ext_avx2) void intpl_luma_block_ext_avx2(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff_h, const int8_t *coeff_v); /* --------------------------------------------------------------------------- */ #define intpl_luma_hor_sse128 FPFX(intpl_luma_hor_sse128) void intpl_luma_hor_sse128(pel_t *dst, int i_dst, mct_t *tmp, int i_tmp, pel_t *src, int i_src, int width, int height, int8_t const *coeff); #define intpl_luma_hor_x3_sse128 FPFX(intpl_luma_hor_x3_sse128) void intpl_luma_hor_x3_sse128(pel_t *const dst[3], int i_dst, mct_t *const tmp[3], int i_tmp, pel_t *src, int i_src, int width, int height, const int8_t **coeff); #define intpl_luma_ver_x3_sse128 FPFX(intpl_luma_ver_x3_sse128) void intpl_luma_ver_x3_sse128(pel_t *const dst[3], int i_dst, pel_t *src, int i_src, int width, int height, int8_t const **coeff); #define intpl_luma_ext_x3_sse128 FPFX(intpl_luma_ext_x3_sse128) void intpl_luma_ext_x3_sse128(pel_t *const dst[3], int i_dst, mct_t *tmp, int i_tmp, int width, int height, const int8_t **coeff); #define intpl_luma_ext_sse128 FPFX(intpl_luma_ext_sse128) void intpl_luma_ext_sse128(pel_t *dst, int i_dst, mct_t *tmp, int i_tmp, int width, int height, const int8_t *coeff); #define avs_pixel_average_sse128 FPFX(avs_pixel_average_sse128) void avs_pixel_average_sse128 (pel_t *dst, int i_dst, const pel_t *src0, int i_src0, const pel_t *src1, int i_src1, int width, int height); #define davs2_pixel_average_avx FPFX(pixel_average_avx) void davs2_pixel_average_avx (pel_t *dst, int i_dst, const pel_t *src1, int i_src1, const pel_t *src2, int i_src2, int width, int height); #define padding_rows_sse128 FPFX(padding_rows_sse128) void padding_rows_sse128 (pel_t *src, int i_src, int width, int height, int start, int rows, int pad); #define padding_rows_lr_sse128 FPFX(padding_rows_lr_sse128) void padding_rows_lr_sse128 (pel_t *src, int i_src, int width, int height, int start, int rows, int pad); #define intpl_chroma_block_hor_avx2 FPFX(intpl_chroma_block_hor_avx2) void intpl_chroma_block_hor_avx2(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff); #define intpl_chroma_block_ver_avx2 FPFX(intpl_chroma_block_ver_avx2) void intpl_chroma_block_ver_avx2(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff); #define intpl_chroma_block_ext_avx2 FPFX(intpl_chroma_block_ext_avx2) void intpl_chroma_block_ext_avx2(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff_h, const int8_t *coeff_v); #define deblock_edge_ver_sse128 FPFX(deblock_edge_ver_sse128) void deblock_edge_ver_sse128 (pel_t *SrcPtr, int stride, int Alpha, int Beta, uint8_t *flt_flag); #define deblock_edge_hor_sse128 FPFX(deblock_edge_hor_sse128) void deblock_edge_hor_sse128 (pel_t *SrcPtr, int stride, int Alpha, int Beta, uint8_t *flt_flag); #if HDR_CHROMA_DELTA_QP #define deblock_edge_ver_c_sse128 FPFX(deblock_edge_ver_c_sse128) void deblock_edge_ver_c_sse128(pel_t *SrcPtrU, pel_t *SrcPtrV, int stride, int *Alpha, int *Beta, uint8_t *flt_flag); #define deblock_edge_hor_c_sse128 
FPFX(deblock_edge_hor_c_sse128) void deblock_edge_hor_c_sse128(pel_t *SrcPtrU, pel_t *SrcPtrV, int stride, int *Alpha, int *Beta, uint8_t *flt_flag); #else #define deblock_edge_ver_c_sse128 FPFX(deblock_edge_ver_c_sse128) void deblock_edge_ver_c_sse128(pel_t *SrcPtrU, pel_t *SrcPtrV, int stride, int Alpha, int Beta, uint8_t *flt_flag); #define deblock_edge_hor_c_sse128 FPFX(deblock_edge_hor_c_sse128) void deblock_edge_hor_c_sse128(pel_t *SrcPtrU, pel_t *SrcPtrV, int stride, int Alpha, int Beta, uint8_t *flt_flag); #endif //--------avx2-------- #define deblock_edge_hor_avx2 FPFX(deblock_edge_hor_avx2) void deblock_edge_hor_avx2(pel_t *SrcPtr, int stride, int Alpha, int Beta, uint8_t *flt_flag); #define deblock_edge_ver_avx2 FPFX(deblock_edge_ver_avx2) void deblock_edge_ver_avx2(pel_t *SrcPtr, int stride, int Alpha, int Beta, uint8_t *flt_flag); #define deblock_edge_hor_c_avx2 FPFX(deblock_edge_hor_c_avx2) void deblock_edge_hor_c_avx2(pel_t *SrcPtrU, pel_t *SrcPtrV, int stride, int Alpha, int Beta, uint8_t *flt_flag); #define deblock_edge_ver_c_avx2 FPFX(deblock_edge_ver_c_avx2) void deblock_edge_ver_c_avx2(pel_t *SrcPtrU, pel_t *SrcPtrV, int stride, int Alpha, int Beta, uint8_t *flt_flag); #define davs2_dequant_sse4 FPFX(dequant_sse4) void davs2_dequant_sse4(coeff_t *coef, const int i_coef, const int scale, const int shift); #define idct_4x4_sse128 FPFX(idct_4x4_sse128) void idct_4x4_sse128 (const coeff_t *src, coeff_t *dst, int i_dst); #define idct_8x8_sse128 FPFX(idct_8x8_sse128) void idct_8x8_sse128 (const coeff_t *src, coeff_t *dst, int i_dst); #define idct_16x16_sse128 FPFX(idct_16x16_sse128) void idct_16x16_sse128(const coeff_t *src, coeff_t *dst, int i_dst); #define idct_32x32_sse128 FPFX(idct_32x32_sse128) void idct_32x32_sse128(const coeff_t *src, coeff_t *dst, int i_dst); #define idct_64x64_sse128 FPFX(idct_64x64_sse128) void idct_64x64_sse128(const coeff_t *src, coeff_t *dst, int i_dst); #define idct_16x4_sse128 FPFX(idct_16x4_sse128) void idct_16x4_sse128 (const coeff_t *src, coeff_t *dst, int i_dst); #define idct_32x8_sse128 FPFX(idct_32x8_sse128) void idct_32x8_sse128 (const coeff_t *src, coeff_t *dst, int i_dst); #define idct_64x16_sse128 FPFX(idct_64x16_sse128) void idct_64x16_sse128(const coeff_t *src, coeff_t *dst, int i_dst); #define idct_4x16_sse128 FPFX(idct_4x16_sse128) void idct_4x16_sse128 (const coeff_t *src, coeff_t *dst, int i_dst); #define idct_8x32_sse128 FPFX(idct_8x32_sse128) void idct_8x32_sse128 (const coeff_t *src, coeff_t *dst, int i_dst); #define idct_16x64_sse128 FPFX(idct_16x64_sse128) void idct_16x64_sse128(const coeff_t *src, coeff_t *dst, int i_dst); #define inv_transform_4x4_2nd_sse128 FPFX(inv_transform_4x4_2nd_sse128) void inv_transform_4x4_2nd_sse128(coeff_t *coeff, int i_coeff); #define inv_transform_2nd_sse128 FPFX(inv_transform_2nd_sse128) void inv_transform_2nd_sse128 (coeff_t *coeff, int i_coeff, int i_mode, int b_top, int b_left); #define inv_wavelet_64x16_sse128 FPFX(inv_wavelet_64x16_sse128) void inv_wavelet_64x16_sse128(coeff_t *coeff); #define inv_wavelet_16x64_sse128 FPFX(inv_wavelet_16x64_sse128) void inv_wavelet_16x64_sse128(coeff_t *coeff); //futl add 2016.11.30 avx2 #define idct_8x8_avx2 FPFX(vec_idct_8x8_avx2) void idct_8x8_avx2 (const coeff_t *src, coeff_t *dst, int i_dst); #define idct_16x16_avx2 FPFX(vec_idct_16x16_avx2) void idct_16x16_avx2(const coeff_t *src, coeff_t *dst, int i_dst); #define idct_32x32_avx2 FPFX(vec_idct_32x32_avx2) void idct_32x32_avx2(const coeff_t *src, coeff_t *dst, int i_dst); #define idct_64x64_avx2 
FPFX(vec_idct_64x64_avx2) void idct_64x64_avx2(const coeff_t *src, coeff_t *dst, int i_dst); #define idct_64x16_avx2 FPFX(vec_idct_64x16_avx2) void idct_64x16_avx2(const coeff_t *src, coeff_t *dst, int i_dst); #define idct_16x64_avx2 FPFX(vec_idct_16x64_avx2) void idct_16x64_avx2(const coeff_t *src, coeff_t *dst, int i_dst); #define inv_wavelet_64x16_avx2 FPFX(inv_wavelet_64x16_avx2) void inv_wavelet_64x16_avx2(coeff_t *coeff); #define inv_wavelet_16x64_avx2 FPFX(inv_wavelet_16x64_avx2) void inv_wavelet_16x64_avx2(coeff_t *coeff); #define inv_wavelet_64x64_avx2 FPFX(inv_wavelet_64x64_avx2) void inv_wavelet_64x64_avx2(coeff_t *coeff); /* DCT half and quad */ #define idct_4x4_half_sse128 FPFX(idct_4x4_half_sse128) void idct_4x4_half_sse128 (const coeff_t *src, coeff_t *dst, int i_dst); #define idct_8x8_half_sse128 FPFX(idct_8x8_half_sse128) void idct_8x8_half_sse128 (const coeff_t *src, coeff_t *dst, int i_dst); #define idct_16x16_half_sse128 FPFX(idct_16x16_half_sse128) void idct_16x16_half_sse128(const coeff_t *src, coeff_t *dst, int i_dst); #define idct_32x32_half_sse128 FPFX(idct_32x32_half_sse128) void idct_32x32_half_sse128(const coeff_t *src, coeff_t *dst, int i_dst); #define idct_64x64_half_sse128 FPFX(idct_64x64_half_sse128) void idct_64x64_half_sse128(const coeff_t *src, coeff_t *dst, int i_dst); #define idct_16x4_half_sse128 FPFX(idct_16x4_half_sse128) void idct_16x4_half_sse128 (const coeff_t *src, coeff_t *dst, int i_dst); #define idct_32x8_half_sse128 FPFX(idct_32x8_half_sse128) void idct_32x8_half_sse128 (const coeff_t *src, coeff_t *dst, int i_dst); #define idct_4x16_half_sse128 FPFX(idct_4x16_half_sse128) void idct_4x16_half_sse128 (const coeff_t *src, coeff_t *dst, int i_dst); #define idct_8x32_half_sse128 FPFX(idct_8x32_half_sse128) void idct_8x32_half_sse128 (const coeff_t *src, coeff_t *dst, int i_dst); #define idct_16x64_half_sse128 FPFX(idct_16x64_half_sse128) void idct_16x64_half_sse128(const coeff_t *src, coeff_t *dst, int i_dst); #define idct_64x16_half_sse128 FPFX(idct_64x16_half_sse128) void idct_64x16_half_sse128(const coeff_t *src, coeff_t *dst, int i_dst); #define idct_4x4_quad_sse128 FPFX(idct_4x4_quad_sse128) void idct_4x4_quad_sse128 (const coeff_t *src, coeff_t *dst, int i_dst); #define idct_8x8_quad_sse128 FPFX(idct_8x8_quad_sse128) void idct_8x8_quad_sse128 (const coeff_t *src, coeff_t *dst, int i_dst); #define idct_16x16_quad_sse128 FPFX(idct_16x16_quad_sse128) void idct_16x16_quad_sse128(const coeff_t *src, coeff_t *dst, int i_dst); #define idct_32x32_quad_sse128 FPFX(idct_32x32_quad_sse128) void idct_32x32_quad_sse128(const coeff_t *src, coeff_t *dst, int i_dst); #define idct_64x64_quad_sse128 FPFX(idct_64x64_quad_sse128) void idct_64x64_quad_sse128(const coeff_t *src, coeff_t *dst, int i_dst); #define idct_16x4_quad_sse128 FPFX(idct_16x4_quad_sse128) void idct_16x4_quad_sse128 (const coeff_t *src, coeff_t *dst, int i_dst); #define idct_32x8_quad_sse128 FPFX(idct_32x8_quad_sse128) void idct_32x8_quad_sse128 (const coeff_t *src, coeff_t *dst, int i_dst); #define idct_4x16_quad_sse128 FPFX(idct_4x16_quad_sse128) void idct_4x16_quad_sse128 (const coeff_t *src, coeff_t *dst, int i_dst); #define idct_8x32_quad_sse128 FPFX(idct_8x32_quad_sse128) void idct_8x32_quad_sse128 (const coeff_t *src, coeff_t *dst, int i_dst); #define idct_16x64_quad_sse128 FPFX(idct_16x64_quad_sse128) void idct_16x64_quad_sse128(const coeff_t *src, coeff_t *dst, int i_dst); #define idct_64x16_quad_sse128 FPFX(idct_64x16_quad_sse128) void idct_64x16_quad_sse128(const coeff_t *src, 
coeff_t *dst, int i_dst); #define idct_8x8_half_avx2 FPFX(idct_8x8_half_avx2) void idct_8x8_half_avx2 (const coeff_t *src, coeff_t *dst, int i_dst); #define idct_16x16_half_avx2 FPFX(idct_16x16_half_avx2) void idct_16x16_half_avx2(const coeff_t *src, coeff_t *dst, int i_dst); #define idct_32x32_half_avx2 FPFX(idct_32x32_half_avx2) void idct_32x32_half_avx2(const coeff_t *src, coeff_t *dst, int i_dst); #define idct_64x64_half_avx2 FPFX(idct_64x64_half_avx2) void idct_64x64_half_avx2(const coeff_t *src, coeff_t *dst, int i_dst); #define idct_16x4_half_avx2 FPFX(idct_16x4_half_avx2) void idct_16x4_half_avx2 (const coeff_t *src, coeff_t *dst, int i_dst); #define idct_32x8_half_avx2 FPFX(idct_32x8_half_avx2) void idct_32x8_half_avx2 (const coeff_t *src, coeff_t *dst, int i_dst); #define idct_4x16_half_avx2 FPFX(idct_4x16_half_avx2) void idct_4x16_half_avx2 (const coeff_t *src, coeff_t *dst, int i_dst); #define idct_8x32_half_avx2 FPFX(idct_8x32_half_avx2) void idct_8x32_half_avx2 (const coeff_t *src, coeff_t *dst, int i_dst); #define idct_16x64_half_avx2 FPFX(idct_16x64_half_avx2) void idct_16x64_half_avx2(const coeff_t *src, coeff_t *dst, int i_dst); #define idct_64x16_half_avx2 FPFX(idct_64x16_half_avx2) void idct_64x16_half_avx2(const coeff_t *src, coeff_t *dst, int i_dst); #define idct_8x8_quad_avx2 FPFX(idct_8x8_quad_avx2) void idct_8x8_quad_avx2 (const coeff_t *src, coeff_t *dst, int i_dst); #define idct_16x16_quad_avx2 FPFX(idct_16x16_quad_avx2) void idct_16x16_quad_avx2(const coeff_t *src, coeff_t *dst, int i_dst); #define idct_32x32_quad_avx2 FPFX(idct_32x32_quad_avx2) void idct_32x32_quad_avx2(const coeff_t *src, coeff_t *dst, int i_dst); #define idct_64x64_quad_avx2 FPFX(idct_64x64_quad_avx2) void idct_64x64_quad_avx2(const coeff_t *src, coeff_t *dst, int i_dst); #define idct_16x4_quad_avx2 FPFX(idct_16x4_quad_avx2) void idct_16x4_quad_avx2 (const coeff_t *src, coeff_t *dst, int i_dst); #define idct_32x8_quad_avx2 FPFX(idct_32x8_quad_avx2) void idct_32x8_quad_avx2 (const coeff_t *src, coeff_t *dst, int i_dst); #define idct_4x16_quad_avx2 FPFX(idct_4x16_quad_avx2) void idct_4x16_quad_avx2 (const coeff_t *src, coeff_t *dst, int i_dst); #define idct_8x32_quad_avx2 FPFX(idct_8x32_quad_avx2) void idct_8x32_quad_avx2 (const coeff_t *src, coeff_t *dst, int i_dst); #define idct_16x64_quad_avx2 FPFX(idct_16x64_quad_avx2) void idct_16x64_quad_avx2(const coeff_t *src, coeff_t *dst, int i_dst); #define idct_64x16_quad_avx2 FPFX(idct_64x16_quad_avx2) void idct_64x16_quad_avx2(const coeff_t *src, coeff_t *dst, int i_dst); /* --------------------------------------------------------------------------- * SAO */ #define SAO_on_block_bo_sse128 FPFX(SAO_on_block_bo_sse128) void SAO_on_block_bo_sse128 (pel_t *p_dst, int i_dst, const pel_t *p_src, int i_src, int i_block_w, int i_block_h, int bit_depth, const sao_param_t *sao_param); #define SAO_on_block_eo_0_sse128 FPFX(SAO_on_block_eo_0_sse128) void SAO_on_block_eo_0_sse128 (pel_t *p_dst, int i_dst, const pel_t *p_src, int i_src, int i_block_w, int i_block_h, int bit_depth, const int *lcu_avail, const int *sao_offset); #define SAO_on_block_eo_45_sse128 FPFX(SAO_on_block_eo_45_sse128) void SAO_on_block_eo_45_sse128 (pel_t *p_dst, int i_dst, const pel_t *p_src, int i_src, int i_block_w, int i_block_h, int bit_depth, const int *lcu_avail, const int *sao_offset); #define SAO_on_block_eo_90_sse128 FPFX(SAO_on_block_eo_90_sse128) void SAO_on_block_eo_90_sse128 (pel_t *p_dst, int i_dst, const pel_t *p_src, int i_src, int i_block_w, int i_block_h, int bit_depth, 
const int *lcu_avail, const int *sao_offset); #define SAO_on_block_eo_135_sse128 FPFX(SAO_on_block_eo_135_sse128) void SAO_on_block_eo_135_sse128(pel_t *p_dst, int i_dst, const pel_t *p_src, int i_src, int i_block_w, int i_block_h, int bit_depth, const int *lcu_avail, const int *sao_offset); #define SAO_on_block_bo_avx2 FPFX(SAO_on_block_bo_avx2) void SAO_on_block_bo_avx2 (pel_t *p_dst, int i_dst, const pel_t *p_src, int i_src, int i_block_w, int i_block_h, int bit_depth, const sao_param_t *sao_param); #define SAO_on_block_eo_0_avx2 FPFX(SAO_on_block_eo_0_avx2) void SAO_on_block_eo_0_avx2 (pel_t *p_dst, int i_dst, const pel_t *p_src, int i_src, int i_block_w, int i_block_h, int bit_depth, const int *lcu_avail, const int *sao_offset); #define SAO_on_block_eo_45_avx2 FPFX(SAO_on_block_eo_45_avx2) void SAO_on_block_eo_45_avx2 (pel_t *p_dst, int i_dst, const pel_t *p_src, int i_src, int i_block_w, int i_block_h, int bit_depth, const int *lcu_avail, const int *sao_offset); #define SAO_on_block_eo_90_avx2 FPFX(SAO_on_block_eo_90_avx2) void SAO_on_block_eo_90_avx2 (pel_t *p_dst, int i_dst, const pel_t *p_src, int i_src, int i_block_w, int i_block_h, int bit_depth, const int *lcu_avail, const int *sao_offset); #define SAO_on_block_eo_135_avx2 FPFX(SAO_on_block_eo_135_avx2) void SAO_on_block_eo_135_avx2(pel_t *p_dst, int i_dst, const pel_t *p_src, int i_src, int i_block_w, int i_block_h, int bit_depth, const int *lcu_avail, const int *sao_offset); /* --------------------------------------------------------------------------- * ALF */ #define alf_filter_block_sse128 FPFX(alf_filter_block_sse128) void alf_filter_block_sse128(pel_t *p_dst, const pel_t *p_src, int stride, int lcu_pix_x, int lcu_pix_y, int lcu_width, int lcu_height, int *alf_coef, int b_top_avail, int b_down_avail); /* --------------------------------------------------------------------------- * Intra Prediction */ #define fill_edge_samples_0_sse128 FPFX(fill_edge_samples_0_sse128) void fill_edge_samples_0_sse128 (const pel_t *pTL, int i_TL, const pel_t *pLcuEP, pel_t *EP, uint32_t i_avai, int bsx, int bsy); #define fill_edge_samples_x_sse128 FPFX(fill_edge_samples_x_sse128) void fill_edge_samples_x_sse128 (const pel_t *pTL, int i_TL, const pel_t *pLcuEP, pel_t *EP, uint32_t i_avai, int bsx, int bsy); #define fill_edge_samples_y_sse128 FPFX(fill_edge_samples_y_sse128) void fill_edge_samples_y_sse128 (const pel_t *pTL, int i_TL, const pel_t *pLcuEP, pel_t *EP, uint32_t i_avai, int bsx, int bsy); #define fill_edge_samples_xy_sse128 FPFX(fill_edge_samples_xy_sse128) void fill_edge_samples_xy_sse128(const pel_t *pTL, int i_TL, const pel_t *pLcuEP, pel_t *EP, uint32_t i_avai, int bsx, int bsy); #define intra_pred_dc_sse128 FPFX(intra_pred_dc_sse128) void intra_pred_dc_sse128 (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_plane_sse128 FPFX(intra_pred_plane_sse128) void intra_pred_plane_sse128 (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_bilinear_sse128 FPFX(intra_pred_bilinear_sse128) void intra_pred_bilinear_sse128 (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_hor_sse128 FPFX(intra_pred_hor_sse128) void intra_pred_hor_sse128 (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ver_sse128 FPFX(intra_pred_ver_sse128) void intra_pred_ver_sse128 (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_x_3_sse128 FPFX(intra_pred_ang_x_3_sse128) void 
intra_pred_ang_x_3_sse128 (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_x_4_sse128 FPFX(intra_pred_ang_x_4_sse128) void intra_pred_ang_x_4_sse128 (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_x_5_sse128 FPFX(intra_pred_ang_x_5_sse128) void intra_pred_ang_x_5_sse128 (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_x_6_sse128 FPFX(intra_pred_ang_x_6_sse128) void intra_pred_ang_x_6_sse128 (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_x_7_sse128 FPFX(intra_pred_ang_x_7_sse128) void intra_pred_ang_x_7_sse128 (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_x_8_sse128 FPFX(intra_pred_ang_x_8_sse128) void intra_pred_ang_x_8_sse128 (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_x_9_sse128 FPFX(intra_pred_ang_x_9_sse128) void intra_pred_ang_x_9_sse128 (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_x_10_sse128 FPFX(intra_pred_ang_x_10_sse128) void intra_pred_ang_x_10_sse128 (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_x_11_sse128 FPFX(intra_pred_ang_x_11_sse128) void intra_pred_ang_x_11_sse128 (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_y_25_sse128 FPFX(intra_pred_ang_y_25_sse128) void intra_pred_ang_y_25_sse128 (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_y_26_sse128 FPFX(intra_pred_ang_y_26_sse128) void intra_pred_ang_y_26_sse128 (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_y_27_sse128 FPFX(intra_pred_ang_y_27_sse128) void intra_pred_ang_y_27_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_y_28_sse128 FPFX(intra_pred_ang_y_28_sse128) void intra_pred_ang_y_28_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_y_29_sse128 FPFX(intra_pred_ang_y_29_sse128) void intra_pred_ang_y_29_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_y_30_sse128 FPFX(intra_pred_ang_y_30_sse128) void intra_pred_ang_y_30_sse128 (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_y_31_sse128 FPFX(intra_pred_ang_y_31_sse128) void intra_pred_ang_y_31_sse128 (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_y_32_sse128 FPFX(intra_pred_ang_y_32_sse128) void intra_pred_ang_y_32_sse128 (pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_xy_13_sse128 FPFX(intra_pred_ang_xy_13_sse128) void intra_pred_ang_xy_13_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_xy_14_sse128 FPFX(intra_pred_ang_xy_14_sse128) void intra_pred_ang_xy_14_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_xy_16_sse128 FPFX(intra_pred_ang_xy_16_sse128) void intra_pred_ang_xy_16_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_xy_18_sse128 FPFX(intra_pred_ang_xy_18_sse128) void intra_pred_ang_xy_18_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_xy_20_sse128 FPFX(intra_pred_ang_xy_20_sse128) void 
intra_pred_ang_xy_20_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_xy_22_sse128 FPFX(intra_pred_ang_xy_22_sse128) void intra_pred_ang_xy_22_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_xy_23_sse128 FPFX(intra_pred_ang_xy_23_sse128) void intra_pred_ang_xy_23_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); //intra prediction avx functions #define intra_pred_ver_avx FPFX(intra_pred_ver_avx) void intra_pred_ver_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_hor_avx FPFX(intra_pred_hor_avx) void intra_pred_hor_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_dc_avx FPFX(intra_pred_dc_avx) void intra_pred_dc_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_plane_avx FPFX(intra_pred_plane_avx) void intra_pred_plane_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_bilinear_avx FPFX(intra_pred_bilinear_avx) void intra_pred_bilinear_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_x_3_avx FPFX(intra_pred_ang_x_3_avx) void intra_pred_ang_x_3_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_x_4_avx FPFX(intra_pred_ang_x_4_avx) void intra_pred_ang_x_4_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_x_5_avx FPFX(intra_pred_ang_x_5_avx) void intra_pred_ang_x_5_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_x_6_avx FPFX(intra_pred_ang_x_6_avx) void intra_pred_ang_x_6_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_x_7_avx FPFX(intra_pred_ang_x_7_avx) void intra_pred_ang_x_7_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_x_8_avx FPFX(intra_pred_ang_x_8_avx) void intra_pred_ang_x_8_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_x_9_avx FPFX(intra_pred_ang_x_9_avx) void intra_pred_ang_x_9_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_x_10_avx FPFX(intra_pred_ang_x_10_avx) void intra_pred_ang_x_10_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_x_11_avx FPFX(intra_pred_ang_x_11_avx) void intra_pred_ang_x_11_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_xy_13_avx FPFX(intra_pred_ang_xy_13_avx) void intra_pred_ang_xy_13_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_xy_14_avx FPFX(intra_pred_ang_xy_14_avx) void intra_pred_ang_xy_14_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_xy_16_avx FPFX(intra_pred_ang_xy_16_avx) void intra_pred_ang_xy_16_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_xy_18_avx FPFX(intra_pred_ang_xy_18_avx) void intra_pred_ang_xy_18_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_xy_20_avx FPFX(intra_pred_ang_xy_20_avx) void intra_pred_ang_xy_20_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_xy_22_avx FPFX(intra_pred_ang_xy_22_avx) void intra_pred_ang_xy_22_avx(pel_t *src, pel_t *dst, int 
i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_xy_23_avx FPFX(intra_pred_ang_xy_23_avx) void intra_pred_ang_xy_23_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_y_25_avx FPFX(intra_pred_ang_y_25_avx) void intra_pred_ang_y_25_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_y_26_avx FPFX(intra_pred_ang_y_26_avx) void intra_pred_ang_y_26_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_y_28_avx FPFX(intra_pred_ang_y_28_avx) void intra_pred_ang_y_28_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_y_30_avx FPFX(intra_pred_ang_y_30_avx) void intra_pred_ang_y_30_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_y_31_avx FPFX(intra_pred_ang_y_31_avx) void intra_pred_ang_y_31_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); #define intra_pred_ang_y_32_avx FPFX(intra_pred_ang_y_32_avx) void intra_pred_ang_y_32_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy); /* Function declaration defines */ #define FUNCDEF_TU(ret, name, cpu, ...) \ ret FPFX(name ## _4x4_ ## cpu(__VA_ARGS__));\ ret FPFX(name ## _8x8_ ## cpu(__VA_ARGS__));\ ret FPFX(name ## _16x16_ ## cpu(__VA_ARGS__));\ ret FPFX(name ## _32x32_ ## cpu(__VA_ARGS__));\ ret FPFX(name ## _64x64_ ## cpu(__VA_ARGS__)) #define FUNCDEF_TU_S(ret, name, cpu, ...) \ ret FPFX(name ## _4_ ## cpu(__VA_ARGS__));\ ret FPFX(name ## _8_ ## cpu(__VA_ARGS__));\ ret FPFX(name ## _16_ ## cpu(__VA_ARGS__));\ ret FPFX(name ## _32_ ## cpu(__VA_ARGS__));\ ret FPFX(name ## _64_ ## cpu(__VA_ARGS__)) #define FUNCDEF_PU(ret, name, cpu, ...) \ ret FPFX(name ## _4x4_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _8x8_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _16x16_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _32x32_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _64x64_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _8x4_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _4x8_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _16x8_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _8x16_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _16x32_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _32x16_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _64x32_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _32x64_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _16x12_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _12x16_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _16x4_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _4x16_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _32x24_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _24x32_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _32x8_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _8x32_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _64x48_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _48x64_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _64x16_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _16x64_ ## cpu)(__VA_ARGS__) #define FUNCDEF_CHROMA_PU(ret, name, cpu, ...) 
\ FUNCDEF_PU(ret, name, cpu, __VA_ARGS__);\ ret FPFX(name ## _4x2_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _2x4_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _8x2_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _2x8_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _8x6_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _6x8_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _8x12_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _12x8_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _6x16_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _16x6_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _2x16_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _16x2_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _4x12_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _12x4_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _32x12_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _12x32_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _32x4_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _4x32_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _32x48_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _48x32_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _16x24_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _24x16_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _8x64_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _64x8_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _64x24_ ## cpu)(__VA_ARGS__);\ ret FPFX(name ## _24x64_ ## cpu)(__VA_ARGS__); #ifdef __cplusplus } #endif #endif // #ifndef DAVS2_INTRINSIC_H davs2-1.6/source/common/vec/intrinsic_alf.cc000066400000000000000000000211351337322544400211100ustar00rootroot00000000000000/* * intrinsic_alf.cc * * Description of this file: * SSE assembly functions of ALF module of the davs2 library * * -------------------------------------------------------------------------- * * davs2 - video decoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. 
 */

#include "../common.h"
#include "intrinsic.h"

#include <mmintrin.h>
#include <emmintrin.h>
#include <tmmintrin.h>
#include <smmintrin.h>

#if !HIGH_BIT_DEPTH
void alf_filter_block_sse128(pel_t *p_dst, const pel_t *p_src, int stride,
                             int lcu_pix_x, int lcu_pix_y, int lcu_width, int lcu_height,
                             int *alf_coeff, int b_top_avail, int b_down_avail)
{
    const pel_t *imgPad1, *imgPad2, *imgPad3, *imgPad4, *imgPad5, *imgPad6;
    __m128i T00, T01, T10, T11, T20, T21, T30, T31, T40, T41, T50, T51;
    __m128i T1, T2, T3, T4, T5, T6, T7, T8;
    __m128i E00, E01, E10, E11, E20, E21, E30, E31, E40, E41;
    __m128i C0, C1, C2, C3, C4, C30, C31, C32, C33;
    __m128i S0, S00, S01, S1, S10, S11, S2, S20, S21, S3, S30, S31, S4, S40, S41, S5, S50, S51, S6, S60, S61, S7, S8, SS1, SS2, S;
    __m128i mSwitch1, mSwitch2, mSwitch3, mSwitch4, mSwitch5;
    __m128i mAddOffset;
    __m128i mZero = _mm_set1_epi16(0);
    __m128i mMax  = _mm_set1_epi16((short)((1 << g_bit_depth) - 1));
    __m128i mask;
    int startPos  = b_top_avail  ? (lcu_pix_y - 4) : lcu_pix_y;
    int endPos    = b_down_avail ? (lcu_pix_y + lcu_height - 4) : (lcu_pix_y + lcu_height);
    int xPosEnd   = lcu_pix_x + lcu_width;
    int xPosEnd16 = xPosEnd - (lcu_width & 0x0f);
    int yUp, yBottom;
    int x, y;

    mask = _mm_loadu_si128((__m128i*)(intrinsic_mask[(lcu_width & 15) - 1]));
    p_src += (startPos * stride) + lcu_pix_x;
    p_dst += (startPos * stride) + lcu_pix_x;
    lcu_height = endPos - startPos;
    lcu_height--;

    C0 = _mm_set1_epi8((char)alf_coeff[0]);
    C1 = _mm_set1_epi8((char)alf_coeff[1]);
    C2 = _mm_set1_epi8((char)alf_coeff[2]);
    C3 = _mm_set1_epi8((char)alf_coeff[3]);
    C4 = _mm_set1_epi8((char)alf_coeff[4]);
    mSwitch1 = _mm_setr_epi8(0, 1, 2, 3, 2, 1, 0, 3, 0, 1, 2, 3, 2, 1, 0, 3);
    C30 = _mm_loadu_si128((__m128i*)&alf_coeff[5]);
    C31 = _mm_packs_epi32(C30, C30);
    C32 = _mm_packs_epi16(C31, C31);
    C33 = _mm_shuffle_epi8(C32, mSwitch1);
    mSwitch2 = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, -1, 1, 2, 3, 4, 5, 6, 7, -1);
    mSwitch3 = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 8, -1, 3, 4, 5, 6, 7, 8, 9, -1);
    mSwitch4 = _mm_setr_epi8(4, 5, 6, 7, 8, 9, 10, -1, 5, 6, 7, 8, 9, 10, 11, -1);
    mSwitch5 = _mm_setr_epi8(6, 7, 8, 9, 10, 11, 12, -1, 7, 8, 9, 10, 11, 12, 13, -1);
    mAddOffset = _mm_set1_epi16(32);

    for (y = 0; y <= lcu_height; y++) {
        yUp     = DAVS2_CLIP3(0, lcu_height, y - 1);
        yBottom = DAVS2_CLIP3(0, lcu_height, y + 1);
        imgPad1 = p_src + (yBottom - y) * stride;
        imgPad2 = p_src + (yUp - y) * stride;

        yUp     = DAVS2_CLIP3(0, lcu_height, y - 2);
        yBottom = DAVS2_CLIP3(0, lcu_height, y + 2);
        imgPad3 = p_src + (yBottom - y) * stride;
        imgPad4 = p_src + (yUp - y) * stride;

        yUp     = DAVS2_CLIP3(0, lcu_height, y - 3);
        yBottom = DAVS2_CLIP3(0, lcu_height, y + 3);
        imgPad5 = p_src + (yBottom - y) * stride;
        imgPad6 = p_src + (yUp - y) * stride;

        // For sequences such as 176x144 the block width is not a multiple of
        // 16, so the last partial group of columns is written with a masked
        // store instead of a full 16-byte store.
        //for (x = lcu_pix_x; x < xPosEnd - 15; x += 16) {
        for (x = 0; x < lcu_width; x += 16) {
            T00 = _mm_loadu_si128((__m128i*)&imgPad6[x]);
            T01 = _mm_loadu_si128((__m128i*)&imgPad5[x]);
            E00 = _mm_unpacklo_epi8(T00, T01);
            E01 = _mm_unpackhi_epi8(T00, T01);
            S00 = _mm_maddubs_epi16(E00, C0);   // C0*P0 for the first 8 pixels
            S01 = _mm_maddubs_epi16(E01, C0);   // C0*P0 for the last 8 pixels
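            /* Note on the pattern above: _mm_unpacklo/hi_epi8 interleaves
             * each sample with its vertically mirrored partner, and
             * _mm_maddubs_epi16 multiplies the resulting u8 pairs by the
             * (identical) s8 coefficient pair and adds them into one s16
             * lane -- i.e. it computes coef * (sample_above + sample_below),
             * exploiting the symmetry of the ALF filter shape. The remaining
             * tap pairs below follow the same scheme. */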
            T10 = _mm_loadu_si128((__m128i*)&imgPad4[x]);
            T11 = _mm_loadu_si128((__m128i*)&imgPad3[x]);
            E10 = _mm_unpacklo_epi8(T10, T11);
            E11 = _mm_unpackhi_epi8(T10, T11);
            S10 = _mm_maddubs_epi16(E10, C1);   // C1*P1 for the first 8 pixels
            S11 = _mm_maddubs_epi16(E11, C1);   // C1*P1 for the last 8 pixels

            T20 = _mm_loadu_si128((__m128i*)&imgPad2[x - 1]);
            T21 = _mm_loadu_si128((__m128i*)&imgPad1[x + 1]);
            E20 = _mm_unpacklo_epi8(T20, T21);
            E21 = _mm_unpackhi_epi8(T20, T21);
            S20 = _mm_maddubs_epi16(E20, C2);
            S21 = _mm_maddubs_epi16(E21, C2);

            T30 = _mm_loadu_si128((__m128i*)&imgPad2[x]);
            T31 = _mm_loadu_si128((__m128i*)&imgPad1[x]);
            E30 = _mm_unpacklo_epi8(T30, T31);
            E31 = _mm_unpackhi_epi8(T30, T31);
            S30 = _mm_maddubs_epi16(E30, C3);
            S31 = _mm_maddubs_epi16(E31, C3);

            T40 = _mm_loadu_si128((__m128i*)&imgPad2[x + 1]);
            T41 = _mm_loadu_si128((__m128i*)&imgPad1[x - 1]);
            E40 = _mm_unpacklo_epi8(T40, T41);
            E41 = _mm_unpackhi_epi8(T40, T41);
            S40 = _mm_maddubs_epi16(E40, C4);
            S41 = _mm_maddubs_epi16(E41, C4);

            T50 = _mm_loadu_si128((__m128i*)&p_src[x - 3]);
            T51 = _mm_loadu_si128((__m128i*)&p_src[x + 5]);
            T1 = _mm_shuffle_epi8(T50, mSwitch2);
            T2 = _mm_shuffle_epi8(T50, mSwitch3);
            T3 = _mm_shuffle_epi8(T50, mSwitch4);
            T4 = _mm_shuffle_epi8(T50, mSwitch5);
            T5 = _mm_shuffle_epi8(T51, mSwitch2);
            T6 = _mm_shuffle_epi8(T51, mSwitch3);
            T7 = _mm_shuffle_epi8(T51, mSwitch4);
            T8 = _mm_shuffle_epi8(T51, mSwitch5);
            S5 = _mm_maddubs_epi16(T1, C33);
            S6 = _mm_maddubs_epi16(T2, C33);
            S7 = _mm_maddubs_epi16(T3, C33);
            S8 = _mm_maddubs_epi16(T4, C33);
            S50 = _mm_hadds_epi16(S5, S6);
            S51 = _mm_hadds_epi16(S7, S8);
            S5 = _mm_hadds_epi16(S50, S51);     // horizontal taps, first 8 pixels

            S4 = _mm_maddubs_epi16(T5, C33);
            S6 = _mm_maddubs_epi16(T6, C33);
            S7 = _mm_maddubs_epi16(T7, C33);
            S8 = _mm_maddubs_epi16(T8, C33);
            S60 = _mm_hadds_epi16(S4, S6);
            S61 = _mm_hadds_epi16(S7, S8);
            S6 = _mm_hadds_epi16(S60, S61);     // horizontal taps, last 8 pixels

            S0 = _mm_adds_epi16(S00, S10);
            S1 = _mm_adds_epi16(S30, S20);
            S2 = _mm_adds_epi16(S40, S5);
            S3 = _mm_adds_epi16(S1, S0);
            SS1 = _mm_adds_epi16(S2, S3);       // filtered sum, first 8 pixels

            S0 = _mm_adds_epi16(S01, S11);
            S1 = _mm_adds_epi16(S31, S21);
            S2 = _mm_adds_epi16(S41, S6);
            S3 = _mm_adds_epi16(S1, S0);
            SS2 = _mm_adds_epi16(S2, S3);       // filtered sum, last 8 pixels

            SS1 = _mm_adds_epi16(SS1, mAddOffset);
            SS1 = _mm_srai_epi16(SS1, 6);
            SS1 = _mm_min_epi16(SS1, mMax);
            SS1 = _mm_max_epi16(SS1, mZero);

            SS2 = _mm_adds_epi16(SS2, mAddOffset);
            SS2 = _mm_srai_epi16(SS2, 6);
            SS2 = _mm_min_epi16(SS2, mMax);
            SS2 = _mm_max_epi16(SS2, mZero);

            S = _mm_packus_epi16(SS1, SS2);
            if (x != xPosEnd16) {
                _mm_storeu_si128((__m128i*)(p_dst + x), S);
            } else {
                _mm_maskmoveu_si128(S, mask, (char *)(p_dst + x));
                break;
            }
        }

        p_src += stride;
        p_dst += stride;
    }
}
#endif  // #if !HIGH_BIT_DEPTH
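
/* Added illustration (not part of the original decoder): a scalar model of
 * the per-pixel filter that alf_filter_block_sse128 vectorizes, assuming
 * 8-bit samples and the 9-coefficient symmetric ALF shape used above. The
 * helper name alf_pixel_scalar is hypothetical, the block is compiled out,
 * and the SIMD path additionally saturates intermediate 16-bit sums, which
 * this model ignores. */
#if 0
static int alf_pixel_scalar(const pel_t *p, int stride, const int *coef, int max_val)
{
    int sum = coef[0] * (p[-3 * stride] + p[3 * stride])    /* C0, rows +/- 3 */
            + coef[1] * (p[-2 * stride] + p[2 * stride])    /* C1, rows +/- 2 */
            + coef[2] * (p[-stride - 1] + p[stride + 1])    /* C2, diagonal   */
            + coef[3] * (p[-stride]     + p[stride])        /* C3, rows +/- 1 */
            + coef[4] * (p[-stride + 1] + p[stride - 1])    /* C4, diagonal   */
            + coef[5] * (p[-3] + p[3])                      /* C33: horizontal taps */
            + coef[6] * (p[-2] + p[2])
            + coef[7] * (p[-1] + p[1])
            + coef[8] * p[0];
    sum = (sum + 32) >> 6;             /* mAddOffset and _mm_srai_epi16(., 6) */
    return sum < 0 ? 0 : (sum > max_val ? max_val : sum);   /* mZero/mMax clip */
}
#endif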
davs2-1.6/source/common/vec/intrinsic_deblock.cc000066400000000000000000000752111337322544400217550ustar00rootroot00000000000000/*
 * intrinsic_deblock.cc
 *
 * Description of this file:
 *    SSE assembly functions of Deblock module of the davs2 library
 *
 * --------------------------------------------------------------------------
 *
 * davs2 - video decoder of AVS2/IEEE1857.4 video coding standard
 * Copyright (C) 2018~ VCL, NELVT, Peking University
 *
 * Authors: Falei LUO
 *          etc.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at sswang @ pku.edu.cn.
 */

#include "../common.h"
#include "intrinsic.h"

#include <mmintrin.h>
#include <emmintrin.h>
#include <tmmintrin.h>
#include <smmintrin.h>

#if !HIGH_BIT_DEPTH
void deblock_edge_ver_sse128(pel_t *SrcPtr, int stride, int Alpha, int Beta, uint8_t *flt_flag)
{
    pel_t *pTmp = SrcPtr - 4;
    int flag0 = flt_flag[0] ? -1 : 0;
    int flag1 = flt_flag[1] ? -1 : 0;

    __m128i TL0, TL1, TL2, TL3;
    __m128i TR0, TR1, TR2, TR3;
    __m128i TL0l, TL1l;
    __m128i TR0l, TR1l;
    __m128i V0, V1, V2, V3, V4, V5;
    __m128i T0, T1, T2, T3, T4, T5, T6, T7;
    __m128i M0, M1, M2;
    __m128i FLT_L, FLT_R, FLT, FS;
    __m128i FS3, FS4, FS56;
    __m128i ALPHA = _mm_set1_epi16((pel_t)Alpha);
    __m128i BETA = _mm_set1_epi16((pel_t)Beta);
    __m128i c_0 = _mm_set1_epi16(0);
    __m128i c_1 = _mm_set1_epi16(1);
    __m128i c_2 = _mm_set1_epi16(2);
    __m128i c_3 = _mm_set1_epi16(3);
    __m128i c_4 = _mm_set1_epi16(4);
    __m128i c_8 = _mm_set1_epi16(8);
    __m128i c_16 = _mm_set1_epi16(16);

    T0 = _mm_loadl_epi64((__m128i*)(pTmp));
    T1 = _mm_loadl_epi64((__m128i*)(pTmp + stride));
    T2 = _mm_loadl_epi64((__m128i*)(pTmp + stride * 2));
    T3 = _mm_loadl_epi64((__m128i*)(pTmp + stride * 3));
    T4 = _mm_loadl_epi64((__m128i*)(pTmp + stride * 4));
    T5 = _mm_loadl_epi64((__m128i*)(pTmp + stride * 5));
    T6 = _mm_loadl_epi64((__m128i*)(pTmp + stride * 6));
    T7 = _mm_loadl_epi64((__m128i*)(pTmp + stride * 7));

    T0 = _mm_unpacklo_epi8(T0, T1);
    T1 = _mm_unpacklo_epi8(T2, T3);
    T2 = _mm_unpacklo_epi8(T4, T5);
    T3 = _mm_unpacklo_epi8(T6, T7);

    T4 = _mm_unpacklo_epi16(T0, T1);
    T5 = _mm_unpacklo_epi16(T2, T3);
    T6 = _mm_unpackhi_epi16(T0, T1);
    T7 = _mm_unpackhi_epi16(T2, T3);

    T0 = _mm_unpacklo_epi32(T4, T5);
    T1 = _mm_unpackhi_epi32(T4, T5);
    T2 = _mm_unpacklo_epi32(T6, T7);
    T3 = _mm_unpackhi_epi32(T6, T7);

    TL3 = _mm_unpacklo_epi8(T0, c_0);
    TL2 = _mm_unpackhi_epi8(T0, c_0);
    TL1 = _mm_unpacklo_epi8(T1, c_0);
    TL0 = _mm_unpackhi_epi8(T1, c_0);
    TR0 = _mm_unpacklo_epi8(T2, c_0);
    TR1 = _mm_unpackhi_epi8(T2, c_0);
    TR2 = _mm_unpacklo_epi8(T3, c_0);
    TR3 = _mm_unpackhi_epi8(T3, c_0);

#define _mm_subabs_epu16(a, b) _mm_abs_epi16(_mm_subs_epi16(a, b))

    T0 = _mm_subabs_epu16(TL0, TR0);
    T1 = _mm_cmpgt_epi16(T0, c_1);
    T2 = _mm_cmpgt_epi16(ALPHA, T0);
    M0 = _mm_set_epi32(flag1, flag1, flag0, flag0);
    M0 = _mm_and_si128(M0, _mm_and_si128(T1, T2)); // mask1

    T0 = _mm_subabs_epu16(TL1, TL0);
    T1 = _mm_subabs_epu16(TR1, TR0);
    FLT_L = _mm_and_si128(_mm_cmpgt_epi16(BETA, T0), c_2);
    FLT_R = _mm_and_si128(_mm_cmpgt_epi16(BETA, T1), c_2);

    T0 = _mm_subabs_epu16(TL2, TL0);
    T1 = _mm_subabs_epu16(TR2, TR0);
    M1 = _mm_cmpgt_epi16(BETA, T0);
    M2 = _mm_cmpgt_epi16(BETA, T1);
    FLT_L = _mm_add_epi16(_mm_and_si128(M1, c_1), FLT_L);
    FLT_R = _mm_add_epi16(_mm_and_si128(M2, c_1), FLT_R);
    FLT = _mm_add_epi16(FLT_L, FLT_R);

    M1 = _mm_and_si128(_mm_cmpeq_epi16(TR0, TR1), _mm_cmpeq_epi16(TL0, TL1));
    T0 = _mm_sub_epi16(FLT, c_2);
    T1 = _mm_sub_epi16(FLT, c_3);
    T2 = _mm_subabs_epu16(TL1, TR1);

    FS56 = _mm_blendv_epi8(T1, T0, M1);
    FS4 = _mm_blendv_epi8(c_1, c_2, _mm_cmpeq_epi16(FLT_L, c_2));
    FS3 = _mm_blendv_epi8(c_0, c_1, _mm_cmpgt_epi16(BETA, T2));

    FS = _mm_blendv_epi8(c_0, FS56, _mm_cmpgt_epi16(FLT, c_4));
    FS = _mm_blendv_epi8(FS, FS4, _mm_cmpeq_epi16(FLT, c_4));
    FS = _mm_blendv_epi8(FS, FS3, _mm_cmpeq_epi16(FLT, c_3));
    FS = _mm_and_si128(FS, M0);

#undef _mm_subabs_epu16
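
    /* Note: at this point FS holds the per-pixel filter strength (0..4).
     * A rough scalar model of the selection above, for illustration only:
     *     if (FLT > 4)       fs = (L0 == L1 && R0 == R1) ? FLT - 2 : FLT - 3;
     *     else if (FLT == 4) fs = (FLT_L == 2) ? 2 : 1;
     *     else if (FLT == 3) fs = (abs(L1 - R1) < Beta) ? 1 : 0;
     *     else               fs = 0;
     * and fs is forced to 0 wherever the edge-activity mask M0 fails.
     * The blocks below then apply progressively stronger low-pass filters
     * for fs == 1 .. 4. */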
    TL0l = TL0;
    TL1l = TL1;
    TR0l = TR0;
    TR1l = TR1;

    /* fs == 1 */
    T2 = _mm_add_epi16(_mm_add_epi16(TL0l, TR0l), c_2); // L0 + R0 + 2
    V0 = _mm_srli_epi16(_mm_add_epi16(_mm_slli_epi16(TL0l, 1), T2), 2);
    V1 = _mm_srli_epi16(_mm_add_epi16(_mm_slli_epi16(TR0l, 1), T2), 2);
    TL0 = _mm_blendv_epi8(TL0, V0, _mm_cmpeq_epi16(FS, c_1));
    TR0 = _mm_blendv_epi8(TR0, V1, _mm_cmpeq_epi16(FS, c_1));

    /* fs == 2 */
    T2 = _mm_slli_epi16(T2, 1); // (L0 << 1) + (R0 << 1) + 4
    T3 = _mm_slli_epi16(T3, 1);
    T0 = _mm_add_epi16(_mm_slli_epi16(TL1l, 1), _mm_add_epi16(TL1l, TR0l));
    T0 = _mm_add_epi16(_mm_slli_epi16(TL0l, 3), _mm_add_epi16(T0, T2));
    V0 = _mm_srli_epi16(_mm_add_epi16(T0, c_4), 4);
    T0 = _mm_add_epi16(_mm_slli_epi16(TR1l, 1), _mm_add_epi16(TR1l, TL0l));
    T0 = _mm_add_epi16(_mm_slli_epi16(TR0l, 3), _mm_add_epi16(T0, T2));
    V1 = _mm_srli_epi16(_mm_add_epi16(T0, c_4), 4);
    TL0 = _mm_blendv_epi8(TL0, V0, _mm_cmpeq_epi16(FS, c_2));
    TR0 = _mm_blendv_epi8(TR0, V1, _mm_cmpeq_epi16(FS, c_2));

    /* fs == 3 */
    T2 = _mm_slli_epi16(T2, 1); // (L0 << 2) + (R0 << 2) + 8
    T3 = _mm_slli_epi16(T3, 1);
    T0 = _mm_add_epi16(_mm_slli_epi16(TL1l, 2), _mm_add_epi16(TL2, TR1l));
    T0 = _mm_add_epi16(_mm_slli_epi16(TL0l, 1), _mm_add_epi16(T0, T2));
    V0 = _mm_srli_epi16(T0, 4);
    T0 = _mm_add_epi16(_mm_slli_epi16(TR1l, 2), _mm_add_epi16(TR2, TL1l));
    T0 = _mm_add_epi16(_mm_slli_epi16(TR0l, 1), _mm_add_epi16(T0, T2));
    V1 = _mm_srli_epi16(T0, 4);
    TL0 = _mm_blendv_epi8(TL0, V0, _mm_cmpeq_epi16(FS, c_3));
    TR0 = _mm_blendv_epi8(TR0, V1, _mm_cmpeq_epi16(FS, c_3));

    T0 = _mm_add_epi16(_mm_add_epi16(TL2, TR0l), _mm_slli_epi16(TL2, 1));
    T0 = _mm_add_epi16(T0, _mm_slli_epi16(TL1l, 3));
    T0 = _mm_add_epi16(T0, _mm_slli_epi16(TL0l, 2));
    V2 = _mm_srli_epi16(_mm_add_epi16(T0, c_8), 4);
    T0 = _mm_add_epi16(_mm_add_epi16(TR2, TL0l), _mm_slli_epi16(TR2, 1));
    T0 = _mm_add_epi16(T0, _mm_slli_epi16(TR1l, 3));
    T0 = _mm_add_epi16(T0, _mm_slli_epi16(TR0l, 2));
    V3 = _mm_srli_epi16(_mm_add_epi16(T0, c_8), 4);
    TL1 = _mm_blendv_epi8(TL1, V2, _mm_cmpeq_epi16(FS, c_3));
    TR1 = _mm_blendv_epi8(TR1, V3, _mm_cmpeq_epi16(FS, c_3));

    FS = _mm_cmpeq_epi16(FS, c_4);
    if (!_mm_testz_si128(FS, _mm_set1_epi16(-1))) {
        /* fs == 4 */
        /* cal L0/R0 */
        T0 = _mm_slli_epi16(_mm_add_epi16(_mm_add_epi16(TL0l, TL2), TR0l), 3);
        T0 = _mm_add_epi16(_mm_add_epi16(T0, c_16), _mm_add_epi16(TL0l, TL2));
        T2 = _mm_add_epi16(_mm_slli_epi16(TR2, 1), _mm_slli_epi16(TR2, 2));
        V0 = _mm_srli_epi16(_mm_add_epi16(T0, T2), 5);
        T0 = _mm_slli_epi16(_mm_add_epi16(_mm_add_epi16(TR0l, TR2), TL0l), 3);
        T0 = _mm_add_epi16(_mm_add_epi16(T0, c_16), _mm_add_epi16(TR0l, TR2));
        T2 = _mm_add_epi16(_mm_slli_epi16(TL2, 1), _mm_slli_epi16(TL2, 2));
        V1 = _mm_srli_epi16(_mm_add_epi16(T0, T2), 5);
        TL0 = _mm_blendv_epi8(TL0, V0, FS);
        TR0 = _mm_blendv_epi8(TR0, V1, FS);

        /* cal L1/R1 */
        T0 = _mm_slli_epi16(_mm_add_epi16(TL2, TR0l), 1);
        T0 = _mm_add_epi16(T0, _mm_sub_epi16(_mm_slli_epi16(TL0l, 3), TL0l));
        T2 = _mm_add_epi16(_mm_slli_epi16(TL2, 2), _mm_add_epi16(TR0l, c_8));
        V2 = _mm_srli_epi16(_mm_add_epi16(T0, T2), 4);
        T0 = _mm_slli_epi16(_mm_add_epi16(TR2, TL0l), 1);
        T0 = _mm_add_epi16(T0, _mm_sub_epi16(_mm_slli_epi16(TR0l, 3), TR0l));
        T2 = _mm_add_epi16(_mm_slli_epi16(TR2, 2), _mm_add_epi16(TL0l, c_8));
        V3 = _mm_srli_epi16(_mm_add_epi16(T0, T2), 4);
        TL1 = _mm_blendv_epi8(TL1, V2, FS);
        TR1 = _mm_blendv_epi8(TR1, V3, FS);

        /* cal L2/R2 */
        T0 = _mm_add_epi16(_mm_slli_epi16(TL2, 1), TL2);
        T2 = _mm_add_epi16(_mm_slli_epi16(TL0l, 2), TR0l);
        V4 = _mm_srli_epi16(_mm_add_epi16(T0, _mm_add_epi16(T2, c_4)), 3);
        T0 = _mm_add_epi16(_mm_slli_epi16(TR2, 1), TR2);
        T2 = _mm_add_epi16(_mm_slli_epi16(TR0l, 2), TL0l);
        V5 = _mm_srli_epi16(_mm_add_epi16(T0, _mm_add_epi16(T2, c_4)), 3);
        TL2 = _mm_blendv_epi8(TL2, V4, FS);
        TR2 = _mm_blendv_epi8(TR2, V5, FS);
    }

    /* store result */
    T0 = _mm_packus_epi16(TL3, TR0);
    T1 = _mm_packus_epi16(TL2, TR1);
    T2 = _mm_packus_epi16(TL1, TR2);
    T3 = _mm_packus_epi16(TL0, TR3);
    T4
= _mm_unpacklo_epi8(T0, T1); T5 = _mm_unpacklo_epi8(T2, T3); T6 = _mm_unpackhi_epi8(T0, T1); T7 = _mm_unpackhi_epi8(T2, T3); V0 = _mm_unpacklo_epi16(T4, T5); V1 = _mm_unpacklo_epi16(T6, T7); V2 = _mm_unpackhi_epi16(T4, T5); V3 = _mm_unpackhi_epi16(T6, T7); T0 = _mm_unpacklo_epi32(V0, V1); T1 = _mm_unpackhi_epi32(V0, V1); T2 = _mm_unpacklo_epi32(V2, V3); T3 = _mm_unpackhi_epi32(V2, V3); pTmp = SrcPtr - 4; _mm_storel_epi64((__m128i*)(pTmp), T0); pTmp += stride; _mm_storel_epi64((__m128i*)(pTmp), _mm_srli_si128(T0, 8)); pTmp += stride; _mm_storel_epi64((__m128i*)(pTmp), T1); pTmp += stride; _mm_storel_epi64((__m128i*)(pTmp), _mm_srli_si128(T1, 8)); pTmp += stride; _mm_storel_epi64((__m128i*)(pTmp), T2); pTmp += stride; _mm_storel_epi64((__m128i*)(pTmp), _mm_srli_si128(T2, 8)); pTmp += stride; _mm_storel_epi64((__m128i*)(pTmp), T3); pTmp += stride; _mm_storel_epi64((__m128i*)(pTmp), _mm_srli_si128(T3, 8)); } void deblock_edge_ver_c_sse128(pel_t *SrcPtrU, pel_t *SrcPtrV, int stride, int Alpha, int Beta, uint8_t *flt_flag) { pel_t *pTmp; int flag0 = flt_flag[0] ? -1 : 0; int flag1 = flt_flag[1] ? -1 : 0; __m128i UVL0, UVL1, UVR0, UVR1; __m128i TL0, TL1, TL2, TL3; __m128i TR0, TR1, TR2, TR3; __m128i T0, T1, T2, T3, T4, T5, T6, T7; __m128i P0, P1, P2, P3, P4, P5, P6, P7; __m128i V0, V1, V2, V3; __m128i M0, M1, M2; __m128i FLT_L, FLT_R, FLT, FS; __m128i FS4, FS56; __m128i ALPHA = _mm_set1_epi16((pel_t)Alpha); __m128i BETA = _mm_set1_epi16((pel_t)Beta); __m128i c_0 = _mm_set1_epi16(0); __m128i c_1 = _mm_set1_epi16(1); __m128i c_2 = _mm_set1_epi16(2); __m128i c_3 = _mm_set1_epi16(3); __m128i c_4 = _mm_set1_epi16(4); __m128i c_8 = _mm_set1_epi16(8); pTmp = SrcPtrU - 4; T0 = _mm_loadl_epi64((__m128i*)(pTmp)); T1 = _mm_loadl_epi64((__m128i*)(pTmp + stride)); T2 = _mm_loadl_epi64((__m128i*)(pTmp + stride * 2)); T3 = _mm_loadl_epi64((__m128i*)(pTmp + stride * 3)); pTmp = SrcPtrV - 4; T4 = _mm_loadl_epi64((__m128i*)(pTmp)); T5 = _mm_loadl_epi64((__m128i*)(pTmp + stride)); T6 = _mm_loadl_epi64((__m128i*)(pTmp + stride * 2)); T7 = _mm_loadl_epi64((__m128i*)(pTmp + stride * 3)); P0 = _mm_unpacklo_epi8(T0, T1); P1 = _mm_unpacklo_epi8(T2, T3); P2 = _mm_unpacklo_epi8(T4, T5); P3 = _mm_unpacklo_epi8(T6, T7); P4 = _mm_unpacklo_epi16(P0, P1); P5 = _mm_unpacklo_epi16(P2, P3); P6 = _mm_unpackhi_epi16(P0, P1); P7 = _mm_unpackhi_epi16(P2, P3); T0 = _mm_unpacklo_epi32(P4, P5); T1 = _mm_unpackhi_epi32(P4, P5); T2 = _mm_unpacklo_epi32(P6, P7); T3 = _mm_unpackhi_epi32(P6, P7); TL3 = _mm_unpacklo_epi8(T0, c_0); TL2 = _mm_unpackhi_epi8(T0, c_0); TL1 = _mm_unpacklo_epi8(T1, c_0); TL0 = _mm_unpackhi_epi8(T1, c_0); TR0 = _mm_unpacklo_epi8(T2, c_0); TR1 = _mm_unpackhi_epi8(T2, c_0); TR2 = _mm_unpacklo_epi8(T3, c_0); TR3 = _mm_unpackhi_epi8(T3, c_0); #define _mm_subabs_epu16(a, b) _mm_abs_epi16(_mm_subs_epi16(a, b)) T0 = _mm_subabs_epu16(TL0, TR0); T1 = _mm_cmpgt_epi16(T0, c_1); T2 = _mm_cmpgt_epi16(ALPHA, T0); M0 = _mm_set_epi32(flag1, flag0, flag1, flag0); M0 = _mm_and_si128(M0, _mm_and_si128(T1, T2)); // mask1 T0 = _mm_subabs_epu16(TL1, TL0); T1 = _mm_subabs_epu16(TR1, TR0); FLT_L = _mm_and_si128(_mm_cmpgt_epi16(BETA, T0), c_2); FLT_R = _mm_and_si128(_mm_cmpgt_epi16(BETA, T1), c_2); T0 = _mm_subabs_epu16(TL2, TL0); T1 = _mm_subabs_epu16(TR2, TR0); M1 = _mm_cmpgt_epi16(BETA, T0); M2 = _mm_cmpgt_epi16(BETA, T1); FLT_L = _mm_add_epi16(_mm_and_si128(M1, c_1), FLT_L); FLT_R = _mm_add_epi16(_mm_and_si128(M2, c_1), FLT_R); FLT = _mm_add_epi16(FLT_L, FLT_R); M1 = _mm_and_si128(_mm_cmpeq_epi16(TR0, TR1), _mm_cmpeq_epi16(TL0, TL1)); T0 
= _mm_sub_epi16(FLT, c_3); T1 = _mm_sub_epi16(FLT, c_4); T2 = _mm_subabs_epu16(TL1, TR1); FS56 = _mm_blendv_epi8(T1, T0, M1); FS4 = _mm_blendv_epi8(c_0, c_1, _mm_cmpeq_epi16(FLT_L, c_2)); FS = _mm_blendv_epi8(c_0, FS56, _mm_cmpgt_epi16(FLT, c_4)); FS = _mm_blendv_epi8(FS, FS4, _mm_cmpeq_epi16(FLT, c_4)); FS = _mm_and_si128(FS, M0); #undef _mm_subabs_epu16 UVL0 = TL0; UVL1 = TL1; UVR0 = TR0; UVR1 = TR1; /* fs == 1 */ T2 = _mm_add_epi16(_mm_add_epi16(UVL0, UVR0), c_2); // L0 + R0 + 2 V0 = _mm_srli_epi16(_mm_add_epi16(_mm_slli_epi16(UVL0, 1), T2), 2); V1 = _mm_srli_epi16(_mm_add_epi16(_mm_slli_epi16(UVR0, 1), T2), 2); TL0 = _mm_blendv_epi8(TL0, V0, _mm_cmpeq_epi16(FS, c_1)); TR0 = _mm_blendv_epi8(TR0, V1, _mm_cmpeq_epi16(FS, c_1)); /* fs == 2 */ T2 = _mm_slli_epi16(T2, 1); // (L0 << 1) + (R0 << 1) + 4 T3 = _mm_slli_epi16(T3, 1); T0 = _mm_add_epi16(_mm_slli_epi16(UVL1, 1), _mm_add_epi16(UVL1, UVR0)); T0 = _mm_add_epi16(_mm_slli_epi16(UVL0, 3), _mm_add_epi16(T0, T2)); V0 = _mm_srli_epi16(_mm_add_epi16(T0, c_4), 4); T0 = _mm_add_epi16(_mm_slli_epi16(UVR1, 1), _mm_add_epi16(UVR1, UVL0)); T0 = _mm_add_epi16(_mm_slli_epi16(UVR0, 3), _mm_add_epi16(T0, T2)); V1 = _mm_srli_epi16(_mm_add_epi16(T0, c_4), 4); TL0 = _mm_blendv_epi8(TL0, V0, _mm_cmpeq_epi16(FS, c_2)); TR0 = _mm_blendv_epi8(TR0, V1, _mm_cmpeq_epi16(FS, c_2)); /* fs == 3 */ T2 = _mm_slli_epi16(T2, 1); // (L0 << 2) + (R0 << 2) + 8 T3 = _mm_slli_epi16(T3, 1); T0 = _mm_add_epi16(_mm_slli_epi16(UVL1, 2), _mm_add_epi16(TL2, UVR1)); T0 = _mm_add_epi16(_mm_slli_epi16(UVL0, 1), _mm_add_epi16(T0, T2)); V0 = _mm_srli_epi16(T0, 4); T0 = _mm_add_epi16(_mm_slli_epi16(UVR1, 2), _mm_add_epi16(TR2, UVL1)); T0 = _mm_add_epi16(_mm_slli_epi16(UVR0, 1), _mm_add_epi16(T0, T2)); V1 = _mm_srli_epi16(T0, 4); TL0 = _mm_blendv_epi8(TL0, V0, _mm_cmpeq_epi16(FS, c_3)); TR0 = _mm_blendv_epi8(TR0, V1, _mm_cmpeq_epi16(FS, c_3)); T0 = _mm_add_epi16(_mm_add_epi16(TL2, UVR0), _mm_slli_epi16(TL2, 1)); T0 = _mm_add_epi16(T0, _mm_slli_epi16(UVL1, 3)); T0 = _mm_add_epi16(T0, _mm_slli_epi16(UVL0, 2)); V2 = _mm_srli_epi16(_mm_add_epi16(T0, c_8), 4); T0 = _mm_add_epi16(_mm_add_epi16(TR2, UVL0), _mm_slli_epi16(TR2, 1)); T0 = _mm_add_epi16(T0, _mm_slli_epi16(UVR1, 3)); T0 = _mm_add_epi16(T0, _mm_slli_epi16(UVR0, 2)); V3 = _mm_srli_epi16(_mm_add_epi16(T0, c_8), 4); TL1 = _mm_blendv_epi8(TL1, V2, _mm_cmpeq_epi16(FS, c_3)); TR1 = _mm_blendv_epi8(TR1, V3, _mm_cmpeq_epi16(FS, c_3)); /* store result */ T0 = _mm_packus_epi16(TL3, TR0); T1 = _mm_packus_epi16(TL2, TR1); T2 = _mm_packus_epi16(TL1, TR2); T3 = _mm_packus_epi16(TL0, TR3); P0 = _mm_unpacklo_epi8(T0, T1); P1 = _mm_unpacklo_epi8(T2, T3); P2 = _mm_unpackhi_epi8(T0, T1); P3 = _mm_unpackhi_epi8(T2, T3); P4 = _mm_unpacklo_epi16(P0, P1); P5 = _mm_unpacklo_epi16(P2, P3); P6 = _mm_unpackhi_epi16(P0, P1); P7 = _mm_unpackhi_epi16(P2, P3); T0 = _mm_unpacklo_epi32(P4, P5); T1 = _mm_unpackhi_epi32(P4, P5); T2 = _mm_unpacklo_epi32(P6, P7); T3 = _mm_unpackhi_epi32(P6, P7); pTmp = SrcPtrU - 4; _mm_storel_epi64((__m128i*)(pTmp), T0); _mm_storel_epi64((__m128i*)(pTmp + stride), _mm_srli_si128(T0, 8)); _mm_storel_epi64((__m128i*)(pTmp + (stride << 1)), T1); _mm_storel_epi64((__m128i*)(pTmp + stride * 3), _mm_srli_si128(T1, 8)); pTmp = SrcPtrV - 4; _mm_storel_epi64((__m128i*)(pTmp), T2); _mm_storel_epi64((__m128i*)(pTmp + stride), _mm_srli_si128(T2, 8)); _mm_storel_epi64((__m128i*)(pTmp + (stride << 1)), T3); _mm_storel_epi64((__m128i*)(pTmp + stride * 3), _mm_srli_si128(T3, 8)); } void deblock_edge_hor_sse128(pel_t *SrcPtr, int stride, int Alpha, int 
Beta, uint8_t *flt_flag) { int inc = stride; int inc2 = inc << 1; int inc3 = inc + inc2; int flag0 = flt_flag[0] ? -1 : 0; int flag1 = flt_flag[1] ? -1 : 0; __m128i TL0, TL1, TL2; __m128i TR0, TR1, TR2; __m128i TL0w, TL1w, TL2w, TR0w, TR1w, TR2w; //for write __m128i V0, V1, V2, V3, V4, V5; __m128i T0, T1, T2; __m128i M0, M1, M2; __m128i FLT_L, FLT_R, FLT, FS; __m128i FS3, FS4, FS56; __m128i ALPHA = _mm_set1_epi16((short)Alpha); __m128i BETA = _mm_set1_epi16((short)Beta); __m128i c_0 = _mm_set1_epi16(0); __m128i c_1 = _mm_set1_epi16(1); __m128i c_2 = _mm_set1_epi16(2); __m128i c_3 = _mm_set1_epi16(3); __m128i c_4 = _mm_set1_epi16(4); __m128i c_8 = _mm_set1_epi16(8); __m128i c_16 = _mm_set1_epi16(16); TL2 = _mm_loadl_epi64((__m128i*)(SrcPtr - inc3)); TL1 = _mm_loadl_epi64((__m128i*)(SrcPtr - inc2)); TL0 = _mm_loadl_epi64((__m128i*)(SrcPtr - inc)); TR0 = _mm_loadl_epi64((__m128i*)(SrcPtr + 0)); TR1 = _mm_loadl_epi64((__m128i*)(SrcPtr + inc)); TR2 = _mm_loadl_epi64((__m128i*)(SrcPtr + inc2)); TL2 = _mm_unpacklo_epi8(TL2, c_0); TL1 = _mm_unpacklo_epi8(TL1, c_0); TL0 = _mm_unpacklo_epi8(TL0, c_0); TR0 = _mm_unpacklo_epi8(TR0, c_0); TR1 = _mm_unpacklo_epi8(TR1, c_0); TR2 = _mm_unpacklo_epi8(TR2, c_0); #define _mm_subabs_epu16(a, b) _mm_abs_epi16(_mm_subs_epi16(a, b)) T0 = _mm_subabs_epu16(TL0, TR0); T1 = _mm_cmpgt_epi16(T0, c_1); T2 = _mm_cmpgt_epi16(ALPHA, T0); M0 = _mm_set_epi32(flag1, flag1, flag0, flag0); M0 = _mm_and_si128(M0, _mm_and_si128(T1, T2)); // mask1 T0 = _mm_subabs_epu16(TL1, TL0); T1 = _mm_subabs_epu16(TR1, TR0); FLT_L = _mm_and_si128(_mm_cmpgt_epi16(BETA, T0), c_2); FLT_R = _mm_and_si128(_mm_cmpgt_epi16(BETA, T1), c_2); T0 = _mm_subabs_epu16(TL2, TL0); T1 = _mm_subabs_epu16(TR2, TR0); M1 = _mm_cmpgt_epi16(BETA, T0); M2 = _mm_cmpgt_epi16(BETA, T1); FLT_L = _mm_add_epi16(_mm_and_si128(M1, c_1), FLT_L); FLT_R = _mm_add_epi16(_mm_and_si128(M2, c_1), FLT_R); FLT = _mm_add_epi16(FLT_L, FLT_R); M1 = _mm_and_si128(_mm_cmpeq_epi16(TR0, TR1), _mm_cmpeq_epi16(TL0, TL1)); T0 = _mm_subs_epi16(FLT, c_2); T1 = _mm_subs_epi16(FLT, c_3); T2 = _mm_subabs_epu16(TL1, TR1); FS56 = _mm_blendv_epi8(T1, T0, M1); FS4 = _mm_blendv_epi8(c_1, c_2, _mm_cmpeq_epi16(FLT_L, c_2)); FS3 = _mm_blendv_epi8(c_0, c_1, _mm_cmpgt_epi16(BETA, T2)); FS = _mm_blendv_epi8(c_0, FS56, _mm_cmpgt_epi16(FLT, c_4)); FS = _mm_blendv_epi8(FS, FS4, _mm_cmpeq_epi16(FLT, c_4)); FS = _mm_blendv_epi8(FS, FS3, _mm_cmpeq_epi16(FLT, c_3)); FS = _mm_and_si128(FS, M0); #undef _mm_subabs_epu16 TR0w = TR0; TR1w = TR1; TL0w = TL0; TL1w = TL1; /* fs == 1 */ T2 = _mm_add_epi16(_mm_add_epi16(TL0, TR0), c_2); // L0 + R0 + 2 V0 = _mm_srli_epi16(_mm_add_epi16(_mm_slli_epi16(TL0, 1), T2), 2); V1 = _mm_srli_epi16(_mm_add_epi16(_mm_slli_epi16(TR0, 1), T2), 2); TL0w = _mm_blendv_epi8(TL0, V0, _mm_cmpeq_epi16(FS, c_1)); TR0w = _mm_blendv_epi8(TR0, V1, _mm_cmpeq_epi16(FS, c_1)); /* fs == 2 */ T2 = _mm_slli_epi16(T2, 1); // (L0 << 1) + (R0 << 1) + 4 T0 = _mm_add_epi16(_mm_slli_epi16(TL1, 1), _mm_add_epi16(TL1, TR0)); T0 = _mm_add_epi16(_mm_slli_epi16(TL0, 3), _mm_add_epi16(T0, T2)); V0 = _mm_srli_epi16(_mm_add_epi16(T0, c_4), 4); T0 = _mm_add_epi16(_mm_slli_epi16(TR1, 1), _mm_add_epi16(TR1, TL0)); T0 = _mm_add_epi16(_mm_slli_epi16(TR0, 3), _mm_add_epi16(T0, T2)); V1 = _mm_srli_epi16(_mm_add_epi16(T0, c_4), 4); TL0w = _mm_blendv_epi8(TL0w, V0, _mm_cmpeq_epi16(FS, c_2)); TR0w = _mm_blendv_epi8(TR0w, V1, _mm_cmpeq_epi16(FS, c_2)); /* fs == 3 */ T2 = _mm_slli_epi16(T2, 1); // (L0 << 2) + (R0 << 2) + 8 T0 = _mm_add_epi16(_mm_slli_epi16(TL1, 2), 
    _mm_add_epi16(TL2, TR1));
    T0 = _mm_add_epi16(_mm_slli_epi16(TL0, 1), _mm_add_epi16(T0, T2));
    V0 = _mm_srli_epi16(T0, 4);
    T0 = _mm_add_epi16(_mm_slli_epi16(TR1, 2), _mm_add_epi16(TR2, TL1));
    T0 = _mm_add_epi16(_mm_slli_epi16(TR0, 1), _mm_add_epi16(T0, T2));
    V1 = _mm_srli_epi16(T0, 4);
    TL0w = _mm_blendv_epi8(TL0w, V0, _mm_cmpeq_epi16(FS, c_3));
    TR0w = _mm_blendv_epi8(TR0w, V1, _mm_cmpeq_epi16(FS, c_3));

    T0 = _mm_add_epi16(_mm_add_epi16(TL2, TR0), _mm_slli_epi16(TL2, 1));
    T0 = _mm_add_epi16(T0, _mm_slli_epi16(TL1, 3));
    T0 = _mm_add_epi16(T0, _mm_slli_epi16(TL0, 2));
    V2 = _mm_srli_epi16(_mm_add_epi16(T0, c_8), 4);
    T0 = _mm_add_epi16(_mm_add_epi16(TR2, TL0), _mm_slli_epi16(TR2, 1));
    T0 = _mm_add_epi16(T0, _mm_slli_epi16(TR1, 3));
    T0 = _mm_add_epi16(T0, _mm_slli_epi16(TR0, 2));
    V3 = _mm_srli_epi16(_mm_add_epi16(T0, c_8), 4);
    TL1w = _mm_blendv_epi8(TL1w, V2, _mm_cmpeq_epi16(FS, c_3));
    TR1w = _mm_blendv_epi8(TR1w, V3, _mm_cmpeq_epi16(FS, c_3));

    FS = _mm_cmpeq_epi16(FS, c_4);
    if (!_mm_testz_si128(FS, _mm_set1_epi16(-1))) {
        /* fs == 4 */
        /* cal L0/R0 */
        T0 = _mm_slli_epi16(_mm_add_epi16(_mm_add_epi16(TL0, TL2), TR0), 3);
        T0 = _mm_add_epi16(_mm_add_epi16(T0, c_16), _mm_add_epi16(TL0, TL2));
        T2 = _mm_add_epi16(_mm_slli_epi16(TR2, 1), _mm_slli_epi16(TR2, 2));
        V0 = _mm_srli_epi16(_mm_add_epi16(T0, T2), 5);
        T0 = _mm_slli_epi16(_mm_add_epi16(_mm_add_epi16(TR0, TR2), TL0), 3);
        T0 = _mm_add_epi16(_mm_add_epi16(T0, c_16), _mm_add_epi16(TR0, TR2));
        T2 = _mm_add_epi16(_mm_slli_epi16(TL2, 1), _mm_slli_epi16(TL2, 2));
        V1 = _mm_srli_epi16(_mm_add_epi16(T0, T2), 5);
        TL0w = _mm_blendv_epi8(TL0w, V0, FS);
        TR0w = _mm_blendv_epi8(TR0w, V1, FS);

        /* cal L1/R1 */
        T0 = _mm_slli_epi16(_mm_add_epi16(TL2, TR0), 1);
        T0 = _mm_add_epi16(T0, _mm_sub_epi16(_mm_slli_epi16(TL0, 3), TL0));
        T2 = _mm_add_epi16(_mm_slli_epi16(TL2, 2), _mm_add_epi16(TR0, c_8));
        V2 = _mm_srli_epi16(_mm_add_epi16(T0, T2), 4);
        T0 = _mm_slli_epi16(_mm_add_epi16(TR2, TL0), 1);
        T0 = _mm_add_epi16(T0, _mm_sub_epi16(_mm_slli_epi16(TR0, 3), TR0));
        T2 = _mm_add_epi16(_mm_slli_epi16(TR2, 2), _mm_add_epi16(TL0, c_8));
        V3 = _mm_srli_epi16(_mm_add_epi16(T0, T2), 4);
        TL1w = _mm_blendv_epi8(TL1w, V2, FS);
        TR1w = _mm_blendv_epi8(TR1w, V3, FS);

        /* cal L2/R2 */
        T0 = _mm_add_epi16(_mm_slli_epi16(TL2, 1), TL2);
        T2 = _mm_add_epi16(_mm_slli_epi16(TL0, 2), TR0);
        V4 = _mm_srli_epi16(_mm_add_epi16(T0, _mm_add_epi16(T2, c_4)), 3);
        T0 = _mm_add_epi16(_mm_slli_epi16(TR2, 1), TR2);
        T2 = _mm_add_epi16(_mm_slli_epi16(TR0, 2), TL0);
        V5 = _mm_srli_epi16(_mm_add_epi16(T0, _mm_add_epi16(T2, c_4)), 3);
        TL2w = _mm_blendv_epi8(TL2, V4, FS);
        TR2w = _mm_blendv_epi8(TR2, V5, FS);

        /* store result */
        _mm_storel_epi64((__m128i*)(SrcPtr - inc ), _mm_packus_epi16(TL0w, c_0));
        _mm_storel_epi64((__m128i*)(SrcPtr - 0   ), _mm_packus_epi16(TR0w, c_0));
        _mm_storel_epi64((__m128i*)(SrcPtr - inc2), _mm_packus_epi16(TL1w, c_0));
        _mm_storel_epi64((__m128i*)(SrcPtr + inc ), _mm_packus_epi16(TR1w, c_0));
        _mm_storel_epi64((__m128i*)(SrcPtr - inc3), _mm_packus_epi16(TL2w, c_0));
        _mm_storel_epi64((__m128i*)(SrcPtr + inc2), _mm_packus_epi16(TR2w, c_0));
    } else {
        /* store result */
        _mm_storel_epi64((__m128i*)(SrcPtr - inc ), _mm_packus_epi16(TL0w, c_0));
        _mm_storel_epi64((__m128i*)(SrcPtr - 0   ), _mm_packus_epi16(TR0w, c_0));
        _mm_storel_epi64((__m128i*)(SrcPtr - inc2), _mm_packus_epi16(TL1w, c_0));
        _mm_storel_epi64((__m128i*)(SrcPtr + inc ), _mm_packus_epi16(TR1w, c_0));
    }
}

void deblock_edge_hor_c_sse128(pel_t *SrcPtrU, pel_t *SrcPtrV, int stride, int Alpha, int Beta, uint8_t *flt_flag)
{
    int inc = stride;
    int inc2 = inc
<< 1; int inc3 = inc + inc2; int flag0 = flt_flag[0] ? -1 : 0; int flag1 = flt_flag[1] ? -1 : 0; __m128i UL0, UL1, UR0, UR1; __m128i TL0, TL1, TL2; __m128i TR0, TR1, TR2; __m128i T0, T1, T2; __m128i V0, V1, V2, V3; __m128i M0, M1, M2; __m128i FLT_L, FLT_R, FLT, FS; __m128i FS4, FS56; __m128i ALPHA = _mm_set1_epi16((pel_t)Alpha); __m128i BETA = _mm_set1_epi16((pel_t)Beta); __m128i c_0 = _mm_set1_epi16(0); __m128i c_1 = _mm_set1_epi16(1); __m128i c_2 = _mm_set1_epi16(2); __m128i c_3 = _mm_set1_epi16(3); __m128i c_4 = _mm_set1_epi16(4); __m128i c_8 = _mm_set1_epi16(8); TL0 = _mm_set_epi32(0, 0, ((int32_t*)(SrcPtrV - inc))[0], ((int32_t*)(SrcPtrU - inc))[0]); TL1 = _mm_set_epi32(0, 0, ((int32_t*)(SrcPtrV - inc2))[0], ((int32_t*)(SrcPtrU - inc2))[0]); TL2 = _mm_set_epi32(0, 0, ((int32_t*)(SrcPtrV - inc3))[0], ((int32_t*)(SrcPtrU - inc3))[0]); TR0 = _mm_set_epi32(0, 0, ((int32_t*)(SrcPtrV))[0], ((int32_t*)(SrcPtrU))[0]); TR1 = _mm_set_epi32(0, 0, ((int32_t*)(SrcPtrV + inc))[0], ((int32_t*)(SrcPtrU + inc))[0]); TR2 = _mm_set_epi32(0, 0, ((int32_t*)(SrcPtrV + inc2))[0], ((int32_t*)(SrcPtrU + inc2))[0]); TL0 = _mm_unpacklo_epi8(TL0, c_0); TL1 = _mm_unpacklo_epi8(TL1, c_0); TL2 = _mm_unpacklo_epi8(TL2, c_0); TR0 = _mm_unpacklo_epi8(TR0, c_0); TR1 = _mm_unpacklo_epi8(TR1, c_0); TR2 = _mm_unpacklo_epi8(TR2, c_0); #define _mm_subabs_epu16(a, b) _mm_abs_epi16(_mm_subs_epi16(a, b)) T0 = _mm_subabs_epu16(TL0, TR0); T1 = _mm_cmpgt_epi16(T0, c_1); T2 = _mm_cmpgt_epi16(ALPHA, T0); M0 = _mm_set_epi32(flag1, flag0, flag1, flag0); M0 = _mm_and_si128(M0, _mm_and_si128(T1, T2)); // mask1 T0 = _mm_subabs_epu16(TL1, TL0); T1 = _mm_subabs_epu16(TR1, TR0); FLT_L = _mm_and_si128(_mm_cmpgt_epi16(BETA, T0), c_2); FLT_R = _mm_and_si128(_mm_cmpgt_epi16(BETA, T1), c_2); T0 = _mm_subabs_epu16(TL2, TL0); T1 = _mm_subabs_epu16(TR2, TR0); M1 = _mm_cmpgt_epi16(BETA, T0); M2 = _mm_cmpgt_epi16(BETA, T1); FLT_L = _mm_add_epi16(_mm_and_si128(M1, c_1), FLT_L); FLT_R = _mm_add_epi16(_mm_and_si128(M2, c_1), FLT_R); FLT = _mm_add_epi16(FLT_L, FLT_R); M1 = _mm_and_si128(_mm_cmpeq_epi16(TR0, TR1), _mm_cmpeq_epi16(TL0, TL1)); T0 = _mm_subs_epi16(FLT, c_3); T1 = _mm_subs_epi16(FLT, c_4); FS56 = _mm_blendv_epi8(T1, T0, M1); FS4 = _mm_blendv_epi8(c_0, c_1, _mm_cmpeq_epi16(FLT_L, c_2)); FS = _mm_blendv_epi8(c_0, FS56, _mm_cmpgt_epi16(FLT, c_4)); FS = _mm_blendv_epi8(FS, FS4, _mm_cmpeq_epi16(FLT, c_4)); FS = _mm_and_si128(FS, M0); #undef _mm_subabs_epu16 UR0 = TR0; //UR0 TR0 to store UR1 = TR1; UL0 = TL0; UL1 = TL1; /* fs == 1 */ T2 = _mm_add_epi16(_mm_add_epi16(TL0, TR0), c_2); // L0 + R0 + 2 V0 = _mm_srli_epi16(_mm_add_epi16(_mm_slli_epi16(TL0, 1), T2), 2); V1 = _mm_srli_epi16(_mm_add_epi16(_mm_slli_epi16(TR0, 1), T2), 2); UL0 = _mm_blendv_epi8(TL0, V0, _mm_cmpeq_epi16(FS, c_1)); UR0 = _mm_blendv_epi8(TR0, V1, _mm_cmpeq_epi16(FS, c_1)); /* fs == 2 */ T2 = _mm_slli_epi16(T2, 1); // (L0 << 1) + (R0 << 1) + 4 T0 = _mm_add_epi16(_mm_slli_epi16(TL1, 1), _mm_add_epi16(TL1, TR0)); T0 = _mm_add_epi16(_mm_slli_epi16(TL0, 3), _mm_add_epi16(T0, T2)); V0 = _mm_srli_epi16(_mm_add_epi16(T0, c_4), 4); T0 = _mm_add_epi16(_mm_slli_epi16(TR1, 1), _mm_add_epi16(TR1, TL0)); T0 = _mm_add_epi16(_mm_slli_epi16(TR0, 3), _mm_add_epi16(T0, T2)); V1 = _mm_srli_epi16(_mm_add_epi16(T0, c_4), 4); UL0 = _mm_blendv_epi8(UL0, V0, _mm_cmpeq_epi16(FS, c_2)); UR0 = _mm_blendv_epi8(UR0, V1, _mm_cmpeq_epi16(FS, c_2)); /* fs == 3 */ T2 = _mm_slli_epi16(T2, 1); // (L0 << 2) + (R0 << 2) + 8 T0 = _mm_add_epi16(_mm_slli_epi16(TL1, 2), _mm_add_epi16(TL2, TR1)); T0 = 
    _mm_add_epi16(_mm_slli_epi16(TL0, 1), _mm_add_epi16(T0, T2));
    V0 = _mm_srli_epi16(T0, 4);
    T0 = _mm_add_epi16(_mm_slli_epi16(TR1, 2), _mm_add_epi16(TR2, TL1));
    T0 = _mm_add_epi16(_mm_slli_epi16(TR0, 1), _mm_add_epi16(T0, T2));
    V1 = _mm_srli_epi16(T0, 4);
    UL0 = _mm_blendv_epi8(UL0, V0, _mm_cmpeq_epi16(FS, c_3));
    UR0 = _mm_blendv_epi8(UR0, V1, _mm_cmpeq_epi16(FS, c_3));

    T0 = _mm_add_epi16(_mm_add_epi16(TL2, TR0), _mm_slli_epi16(TL2, 1));
    T0 = _mm_add_epi16(T0, _mm_slli_epi16(TL1, 3));
    T0 = _mm_add_epi16(T0, _mm_slli_epi16(TL0, 2));
    V2 = _mm_srli_epi16(_mm_add_epi16(T0, c_8), 4);
    T0 = _mm_add_epi16(_mm_add_epi16(TR2, TL0), _mm_slli_epi16(TR2, 1));
    T0 = _mm_add_epi16(T0, _mm_slli_epi16(TR1, 3));
    T0 = _mm_add_epi16(T0, _mm_slli_epi16(TR0, 2));
    V3 = _mm_srli_epi16(_mm_add_epi16(T0, c_8), 4);
    UL1 = _mm_blendv_epi8(UL1, V2, _mm_cmpeq_epi16(FS, c_3));
    UR1 = _mm_blendv_epi8(UR1, V3, _mm_cmpeq_epi16(FS, c_3));

    /* store result */
    UL0 = _mm_packus_epi16(UL0, c_0);
    UL1 = _mm_packus_epi16(UL1, c_0);
    UR0 = _mm_packus_epi16(UR0, c_0);
    UR1 = _mm_packus_epi16(UR1, c_0);

    ((int32_t*)(SrcPtrU - inc ))[0] = M128_I32(UL0, 0);
    ((int32_t*)(SrcPtrU       ))[0] = M128_I32(UR0, 0);
    ((int32_t*)(SrcPtrU - inc2))[0] = M128_I32(UL1, 0);
    ((int32_t*)(SrcPtrU + inc ))[0] = M128_I32(UR1, 0);
    ((int32_t*)(SrcPtrV - inc ))[0] = M128_I32(UL0, 1);
    ((int32_t*)(SrcPtrV       ))[0] = M128_I32(UR0, 1);
    ((int32_t*)(SrcPtrV - inc2))[0] = M128_I32(UL1, 1);
    ((int32_t*)(SrcPtrV + inc ))[0] = M128_I32(UR1, 1);
}
#endif  // #if !HIGH_BIT_DEPTH
davs2-1.6/source/common/vec/intrinsic_deblock_avx2.cc000066400000000000000000001010461337322544400227110ustar00rootroot00000000000000/*
 * intrinsic_deblock_avx2.cc
 *
 * Description of this file:
 *    AVX2 assembly functions of Deblock module of the davs2 library
 *
 * --------------------------------------------------------------------------
 *
 * davs2 - video decoder of AVS2/IEEE1857.4 video coding standard
 * Copyright (C) 2018~ VCL, NELVT, Peking University
 *
 * Authors: Falei LUO
 *          etc.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at sswang @ pku.edu.cn.
 */

#include "../common.h"
#include "intrinsic.h"

#include <mmintrin.h>
#include <emmintrin.h>
#include <tmmintrin.h>
#include <smmintrin.h>
#include <immintrin.h>

#if !HIGH_BIT_DEPTH
__m128i c_0_128;
__m256i c_f;
__m256i c_0;
__m256i c_1;
__m256i c_2;
__m256i c_3;
__m256i c_4;
__m256i c_8;
__m256i c_16;

/*----------------------avx2-----------------------------------*/
void deblock_edge_ver_avx2(pel_t *SrcPtr, int stride, int Alpha, int Beta, uint8_t *flt_flag)
{
    pel_t *pTmp = SrcPtr - 4;
    int flag0 = flt_flag[0] ? -1 : 0;
    int flag1 = flt_flag[1] ?
-1 : 0; __m128i TL0, TL1, TL2, TL3; __m128i TR0, TR1, TR2, TR3; __m128i T0, T1, T2, T3, T4, T5, T6, T7; __m128i M0, M1; __m128i FLT, FS; __m128i FS3, FS4, FS56; __m256i TLR0, TLR1, TLR2; // store TL* and TR* __m256i TRL0, TRL1, TRL2; // store TR* and TL* __m256i T0_256, T1_256, T2_256; __m256i FLT_LR; __m256i TLR0w, TLR1w; __m256i FS_256; __m128i ALPHA = _mm_set1_epi16((pel_t)Alpha); __m128i BETA = _mm_set1_epi16((pel_t)Beta); __m128i c_0 = _mm_set1_epi16(0); __m256i c_1_256 = _mm256_set1_epi16(1); __m256i c_2_256 = _mm256_set1_epi16(2); __m256i c_3_256 = _mm256_set1_epi16(3); __m256i c_4_256 = _mm256_set1_epi16(4); __m256i c_8_256 = _mm256_set1_epi16(8); __m256i c_16_256 = _mm256_set1_epi16(16); __m256i BETA_256 = _mm256_set1_epi16((short)Beta); T0 = _mm_loadl_epi64((__m128i*)(pTmp)); T1 = _mm_loadl_epi64((__m128i*)(pTmp + stride)); T2 = _mm_loadl_epi64((__m128i*)(pTmp + stride * 2)); T3 = _mm_loadl_epi64((__m128i*)(pTmp + stride * 3)); T4 = _mm_loadl_epi64((__m128i*)(pTmp + stride * 4)); T5 = _mm_loadl_epi64((__m128i*)(pTmp + stride * 5)); T6 = _mm_loadl_epi64((__m128i*)(pTmp + stride * 6)); T7 = _mm_loadl_epi64((__m128i*)(pTmp + stride * 7)); //--------------- transpose ------------------------------- T0 = _mm_unpacklo_epi8(T0, T1); T1 = _mm_unpacklo_epi8(T2, T3); T2 = _mm_unpacklo_epi8(T4, T5); T3 = _mm_unpacklo_epi8(T6, T7); T4 = _mm_unpacklo_epi16(T0, T1); T5 = _mm_unpacklo_epi16(T2, T3); T6 = _mm_unpackhi_epi16(T0, T1); T7 = _mm_unpackhi_epi16(T2, T3); /* TLR0 = _mm256_inserti128_si256(_mm256_castsi128_si256(T4), T6, 1); TLR1 = _mm256_inserti128_si256(_mm256_castsi128_si256(T5), T7, 1); TLR0w = _mm256_unpacklo_epi32(TLR0, TLR1); //T0 T2 TLR1w = _mm256_unpackhi_epi32(TLR0, TLR1); //T1 T3 TLR3 = _mm256_unpacklo_epi8(TLR0w, c_0_256); //TL3 TR0 TLR2 = _mm256_unpackhi_epi8(TLR0w, c_0_256); //TL2 TR1 TLR1 = _mm256_unpacklo_epi8(TLR1w, c_0_256); //TL1 TR2 TLR0 = _mm256_unpackhi_epi8(TLR1w, c_0_256); //TL0 TR3 TR0 = _mm256_extracti128_si256(TLR3, 0x01); TR1 = _mm256_extracti128_si256(TLR2, 0x01); TR2 = _mm256_extracti128_si256(TLR1, 0x01); TR3 = _mm256_extracti128_si256(TLR0, 0x01); TLR0 = _mm256_inserti128_si256(TLR0, TR0, 1); TLR1 = _mm256_inserti128_si256(TLR1, TR1, 1); TLR2 = _mm256_inserti128_si256(TLR2, TR2, 1); TRL0 = _mm256_inserti128_si256(_mm256_castsi128_si256(TR0), _mm256_castsi256_si128(TLR0), 1); TRL1 = _mm256_inserti128_si256(_mm256_castsi128_si256(TR1), _mm256_castsi256_si128(TLR1), 1); */ T0 = _mm_unpacklo_epi32(T4, T5); T1 = _mm_unpackhi_epi32(T4, T5); T2 = _mm_unpacklo_epi32(T6, T7); T3 = _mm_unpackhi_epi32(T6, T7); TL3 = _mm_unpacklo_epi8(T0, c_0); TL2 = _mm_unpackhi_epi8(T0, c_0); TL1 = _mm_unpacklo_epi8(T1, c_0); TL0 = _mm_unpackhi_epi8(T1, c_0); TR0 = _mm_unpacklo_epi8(T2, c_0); TR1 = _mm_unpackhi_epi8(T2, c_0); TR2 = _mm_unpacklo_epi8(T3, c_0); TR3 = _mm_unpackhi_epi8(T3, c_0); TLR0 = _mm256_inserti128_si256(_mm256_castsi128_si256(TL0), TR0, 1); TLR1 = _mm256_inserti128_si256(_mm256_castsi128_si256(TL1), TR1, 1); TLR2 = _mm256_inserti128_si256(_mm256_castsi128_si256(TL2), TR2, 1); TRL0 = _mm256_inserti128_si256(_mm256_castsi128_si256(TR0), TL0, 1); TRL1 = _mm256_inserti128_si256(_mm256_castsi128_si256(TR1), TL1, 1); T0 = _mm_abs_epi16(_mm_subs_epi16(TL0, TR0)); T1 = _mm_cmpgt_epi16(T0, _mm256_castsi256_si128(c_1_256)); T2 = _mm_cmpgt_epi16(ALPHA, T0); M0 = _mm_set_epi32(flag1, flag1, flag0, flag0); M0 = _mm_and_si128(M0, _mm_and_si128(T1, T2)); // mask1 T0_256 = _mm256_abs_epi16(_mm256_subs_epi16(TLR1, TLR0)); FLT_LR = _mm256_and_si256(_mm256_cmpgt_epi16(BETA_256, 
T0_256), c_2_256); T1_256 = _mm256_abs_epi16(_mm256_subs_epi16(TLR2, TLR0)); T2_256 = _mm256_cmpgt_epi16(BETA_256, T1_256); FLT_LR = _mm256_add_epi16(_mm256_and_si256(T2_256, c_1_256), FLT_LR); FLT = _mm_add_epi16(_mm256_castsi256_si128(FLT_LR), _mm256_extracti128_si256(FLT_LR, 0x01)); T0_256 = _mm256_cmpeq_epi16(TLR1, TLR0); M1 = _mm_and_si128(_mm256_castsi256_si128(T0_256), _mm256_extracti128_si256(T0_256, 0x01)); T0 = _mm_subs_epi16(FLT, _mm256_castsi256_si128(c_2_256)); T1 = _mm_subs_epi16(FLT, _mm256_castsi256_si128(c_3_256)); T2 = _mm_abs_epi16(_mm_subs_epi16(TL1, TR1)); FS56 = _mm_blendv_epi8(T1, T0, M1); FS4 = _mm_blendv_epi8(_mm256_castsi256_si128(c_1_256), _mm256_castsi256_si128(c_2_256), _mm_cmpeq_epi16(_mm256_castsi256_si128(FLT_LR), _mm256_castsi256_si128(c_2_256))); FS3 = _mm_blendv_epi8(c_0, _mm256_castsi256_si128(c_1_256), _mm_cmpgt_epi16(BETA, T2)); FS = _mm_blendv_epi8(c_0, FS56, _mm_cmpgt_epi16(FLT, _mm256_castsi256_si128(c_4_256))); FS = _mm_blendv_epi8(FS, FS4, _mm_cmpeq_epi16(FLT, _mm256_castsi256_si128(c_4_256))); FS = _mm_blendv_epi8(FS, FS3, _mm_cmpeq_epi16(FLT, _mm256_castsi256_si128(c_3_256))); FS = _mm_and_si128(FS, M0); FS_256 = _mm256_inserti128_si256(_mm256_castsi128_si256(FS), FS, 1); TLR0w = TLR0; TLR1w = TLR1; /* fs == 1 */ T2 = _mm_add_epi16(_mm_add_epi16(TL0, TR0), _mm256_castsi256_si128(c_2_256)); // L0 + R0 + 2 T2_256 = _mm256_castsi128_si256(T2); T2_256 = _mm256_inserti128_si256(T2_256, T2, 1); // save T1_256 = _mm256_srli_epi16(_mm256_add_epi16(_mm256_slli_epi16(TLR0, 1), T2_256), 2); TLR0w = _mm256_blendv_epi8(TLR0, T1_256, _mm256_cmpeq_epi16(FS_256, c_1_256)); /* fs == 2 */ T2_256 = _mm256_slli_epi16(T2_256, 1); T0_256 = _mm256_add_epi16(_mm256_slli_epi16(TLR1, 1), _mm256_add_epi16(TLR1, TRL0)); T0_256 = _mm256_add_epi16(_mm256_slli_epi16(TLR0, 3), _mm256_add_epi16(T0_256, T2_256)); T1_256 = _mm256_srli_epi16(_mm256_add_epi16(T0_256, c_4_256), 4); TLR0w = _mm256_blendv_epi8(TLR0w, T1_256, _mm256_cmpeq_epi16(FS_256, c_2_256)); /* fs == 3 */ T2_256 = _mm256_slli_epi16(T2_256, 1); // (L0 << 2) + (R0 << 2) + 8 T0_256 = _mm256_add_epi16(_mm256_slli_epi16(TLR1, 2), _mm256_add_epi16(TLR2, TRL1)); T0_256 = _mm256_add_epi16(_mm256_slli_epi16(TLR0, 1), _mm256_add_epi16(T0_256, T2_256)); T1_256 = _mm256_srli_epi16(T0_256, 4); TLR0w = _mm256_blendv_epi8(TLR0w, T1_256, _mm256_cmpeq_epi16(FS_256, c_3_256)); T0_256 = _mm256_add_epi16(_mm256_add_epi16(TLR2, TRL0), _mm256_slli_epi16(TLR2, 1)); T0_256 = _mm256_add_epi16(T0_256, _mm256_slli_epi16(TLR1, 3)); T0_256 = _mm256_add_epi16(T0_256, _mm256_slli_epi16(TLR0, 2)); T1_256 = _mm256_srli_epi16(_mm256_add_epi16(T0_256, c_8_256), 4); TLR1w = _mm256_blendv_epi8(TLR1w, T1_256, _mm256_cmpeq_epi16(FS_256, c_3_256)); FS = _mm_cmpeq_epi16(FS, _mm256_castsi256_si128(c_4_256)); if (_mm_extract_epi64(FS, 0) || _mm_extract_epi64(FS, 1)) { /* fs == 4 */ TRL2 = _mm256_inserti128_si256(_mm256_castsi128_si256(TR2), TL2, 1); FS_256 = _mm256_inserti128_si256(_mm256_castsi128_si256(FS), FS, 1); /* cal L0/R0 */ T0_256 = _mm256_slli_epi16(_mm256_add_epi16(_mm256_add_epi16(TLR0, TLR2), TRL0), 3); T0_256 = _mm256_add_epi16(_mm256_add_epi16(T0_256, c_16_256), _mm256_add_epi16(TLR0, TLR2)); T2_256 = _mm256_add_epi16(_mm256_slli_epi16(TRL2, 1), _mm256_slli_epi16(TRL2, 2)); T1_256 = _mm256_srli_epi16(_mm256_add_epi16(T0_256, T2_256), 5); TLR0w = _mm256_blendv_epi8(TLR0w, T1_256, FS_256); /* cal L1/R1 */ T0_256 = _mm256_slli_epi16(_mm256_add_epi16(TLR2, TRL0), 1); T0_256 = _mm256_add_epi16(T0_256, _mm256_sub_epi16(_mm256_slli_epi16(TLR0, 3), 
        TLR0));
        T2_256 = _mm256_add_epi16(_mm256_slli_epi16(TLR2, 2), _mm256_add_epi16(TRL0, c_8_256));
        T1_256 = _mm256_srli_epi16(_mm256_add_epi16(T0_256, T2_256), 4);
        TLR1w = _mm256_blendv_epi8(TLR1w, T1_256, FS_256);

        /* cal L2/R2 */
        T0_256 = _mm256_add_epi16(_mm256_slli_epi16(TLR2, 1), TLR2);
        T2_256 = _mm256_add_epi16(_mm256_slli_epi16(TLR0, 2), TRL0);
        T1_256 = _mm256_srli_epi16(_mm256_add_epi16(T0_256, _mm256_add_epi16(T2_256, c_4_256)), 3);
        TLR2 = _mm256_blendv_epi8(TLR2, T1_256, FS_256);
    }

    /* store result */
    T4 = _mm_packus_epi16(TL3, _mm256_extracti128_si256(TLR0w, 0x01));
    T5 = _mm_packus_epi16(_mm256_castsi256_si128(TLR2), _mm256_extracti128_si256(TLR1w, 0x01));
    T6 = _mm_packus_epi16(_mm256_castsi256_si128(TLR1w), _mm256_extracti128_si256(TLR2, 0x01));
    T7 = _mm_packus_epi16(_mm256_castsi256_si128(TLR0w), TR3);

    T0 = _mm_unpacklo_epi8(T4, T5);
    T1 = _mm_unpacklo_epi8(T6, T7);
    T2 = _mm_unpackhi_epi8(T4, T5);
    T3 = _mm_unpackhi_epi8(T6, T7);
    T4 = _mm_unpacklo_epi16(T0, T1);
    T5 = _mm_unpacklo_epi16(T2, T3);
    T6 = _mm_unpackhi_epi16(T0, T1);
    T7 = _mm_unpackhi_epi16(T2, T3);
    T0 = _mm_unpacklo_epi32(T4, T5);
    T1 = _mm_unpackhi_epi32(T4, T5);
    T2 = _mm_unpacklo_epi32(T6, T7);
    T3 = _mm_unpackhi_epi32(T6, T7);

    pTmp = SrcPtr - 4;
    _mm_storel_epi64((__m128i*)(pTmp), T0);
    pTmp += stride;
    _mm_storel_epi64((__m128i*)(pTmp), _mm_srli_si128(T0, 8));
    pTmp += stride;
    _mm_storel_epi64((__m128i*)(pTmp), T1);
    pTmp += stride;
    _mm_storel_epi64((__m128i*)(pTmp), _mm_srli_si128(T1, 8));
    pTmp += stride;
    _mm_storel_epi64((__m128i*)(pTmp), T2);
    pTmp += stride;
    _mm_storel_epi64((__m128i*)(pTmp), _mm_srli_si128(T2, 8));
    pTmp += stride;
    _mm_storel_epi64((__m128i*)(pTmp), T3);
    pTmp += stride;
    _mm_storel_epi64((__m128i*)(pTmp), _mm_srli_si128(T3, 8));
}
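
/* Note on the AVX2 layout used in this file: the left and right sides of the
 * edge are packed into the two 128-bit lanes of one 256-bit register
 * (TLR0 = {TL0 | TR0}, TRL0 = {TR0 | TL0}, ...), so a single _mm256_*
 * instruction sequence filters both sides of the edge that the SSE128
 * versions above handle with two mirrored _mm_* sequences. */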
-1 : 0; __m128i TL0, TL1, TL2, TL3; __m128i TR0, TR1, TR2, TR3; __m128i T0, T1, T2, T3, T4, T5, T6, T7; __m128i M0, M1; __m128i FLT, FS; __m128i FS4, FS56; __m256i TLR0, TLR1, TLR2; // store TL* and TR* __m256i TRL0, TRL1; // store TR* and TL* __m256i T0_256, T1_256, T2_256; __m256i FLT_X; __m256i TLR0w, TLR1w; __m256i FS_256; __m128i ALPHA = _mm_set1_epi16((pel_t)Alpha); __m128i c_0 = _mm_set1_epi16(0); __m256i c_1_256 = _mm256_set1_epi16(1); __m256i c_2_256 = _mm256_set1_epi16(2); __m256i c_3_256 = _mm256_set1_epi16(3); __m256i c_4_256 = _mm256_set1_epi16(4); __m256i c_8_256 = _mm256_set1_epi16(8); __m256i BETA_256 = _mm256_set1_epi16((short)Beta); pTmp = SrcPtrU - 4; T0 = _mm_loadl_epi64((__m128i*)(pTmp)); T1 = _mm_loadl_epi64((__m128i*)(pTmp + stride)); T2 = _mm_loadl_epi64((__m128i*)(pTmp + stride * 2)); T3 = _mm_loadl_epi64((__m128i*)(pTmp + stride * 3)); pTmp = SrcPtrV - 4; T4 = _mm_loadl_epi64((__m128i*)(pTmp)); T5 = _mm_loadl_epi64((__m128i*)(pTmp + stride)); T6 = _mm_loadl_epi64((__m128i*)(pTmp + stride * 2)); T7 = _mm_loadl_epi64((__m128i*)(pTmp + stride * 3)); T0 = _mm_unpacklo_epi8(T0, T1); T1 = _mm_unpacklo_epi8(T2, T3); T2 = _mm_unpacklo_epi8(T4, T5); T3 = _mm_unpacklo_epi8(T6, T7); T4 = _mm_unpacklo_epi16(T0, T1); T5 = _mm_unpacklo_epi16(T2, T3); T6 = _mm_unpackhi_epi16(T0, T1); T7 = _mm_unpackhi_epi16(T2, T3); T0 = _mm_unpacklo_epi32(T4, T5); T1 = _mm_unpackhi_epi32(T4, T5); T2 = _mm_unpacklo_epi32(T6, T7); T3 = _mm_unpackhi_epi32(T6, T7); TL3 = _mm_unpacklo_epi8(T0, c_0); TL2 = _mm_unpackhi_epi8(T0, c_0); TL1 = _mm_unpacklo_epi8(T1, c_0); TL0 = _mm_unpackhi_epi8(T1, c_0); TR0 = _mm_unpacklo_epi8(T2, c_0); TR1 = _mm_unpackhi_epi8(T2, c_0); TR2 = _mm_unpacklo_epi8(T3, c_0); TR3 = _mm_unpackhi_epi8(T3, c_0); TLR0 = _mm256_inserti128_si256(_mm256_castsi128_si256(TL0), TR0, 1); TLR1 = _mm256_inserti128_si256(_mm256_castsi128_si256(TL1), TR1, 1); TLR2 = _mm256_inserti128_si256(_mm256_castsi128_si256(TL2), TR2, 1); TRL0 = _mm256_inserti128_si256(_mm256_castsi128_si256(TR0), TL0, 1); TRL1 = _mm256_inserti128_si256(_mm256_castsi128_si256(TR1), TL1, 1); T0 = _mm_abs_epi16(_mm_subs_epi16(_mm256_castsi256_si128(TLR0), _mm256_castsi256_si128(TRL0))); T1 = _mm_cmpgt_epi16(T0, _mm256_castsi256_si128(c_1_256)); T2 = _mm_cmpgt_epi16(ALPHA, T0); M0 = _mm_set_epi32(flag1, flag0, flag1, flag0); M0 = _mm_and_si128(M0, _mm_and_si128(T1, T2)); // mask1 T0_256 = _mm256_abs_epi16(_mm256_subs_epi16(TLR1, TLR0)); FLT_X = _mm256_and_si256(_mm256_cmpgt_epi16(BETA_256, T0_256), c_2_256); T0_256 = _mm256_abs_epi16(_mm256_subs_epi16(TLR2, TLR0)); T1_256 = _mm256_and_si256(_mm256_cmpgt_epi16(BETA_256, T0_256), c_1_256); FLT_X = _mm256_add_epi16(T1_256, FLT_X); FLT = _mm_add_epi16(_mm256_castsi256_si128(FLT_X), _mm256_extracti128_si256(FLT_X, 0x01)); T0_256 = _mm256_cmpeq_epi16(TLR1, TLR0); M1 = _mm_and_si128(_mm256_castsi256_si128(T0_256), _mm256_extracti128_si256(T0_256, 0x01)); T0 = _mm_subs_epi16(FLT, _mm256_castsi256_si128(c_3_256)); T1 = _mm_subs_epi16(FLT, _mm256_castsi256_si128(c_4_256)); FS56 = _mm_blendv_epi8(T1, T0, M1); FS4 = _mm_blendv_epi8(c_0, _mm256_castsi256_si128(c_1_256), _mm_cmpeq_epi16(_mm256_castsi256_si128(FLT_X), _mm256_castsi256_si128(c_2_256))); FS = _mm_blendv_epi8(c_0, FS56, _mm_cmpgt_epi16(FLT, _mm256_castsi256_si128(c_4_256))); FS = _mm_blendv_epi8(FS, FS4, _mm_cmpeq_epi16(FLT, _mm256_castsi256_si128(c_4_256))); FS = _mm_and_si128(FS, M0); FS_256 = _mm256_inserti128_si256(_mm256_castsi128_si256(FS), FS, 1); TLR0w = TLR0; TLR1w = TLR1; /* fs == 1 */ T2 = 
_mm_add_epi16(_mm_add_epi16(_mm256_castsi256_si128(TLR0), _mm256_castsi256_si128(TRL0)), _mm256_castsi256_si128(c_2_256)); // L0 + R0 + 2
    T2_256 = _mm256_castsi128_si256(T2);
    T2_256 = _mm256_inserti128_si256(T2_256, T2, 1); // save
    T1_256 = _mm256_srli_epi16(_mm256_add_epi16(_mm256_slli_epi16(TLR0, 1), T2_256), 2);
    TLR0w = _mm256_blendv_epi8(TLR0, T1_256, _mm256_cmpeq_epi16(FS_256, c_1_256));
    /* fs == 2 */
    T2_256 = _mm256_slli_epi16(T2_256, 1);
    T0_256 = _mm256_add_epi16(_mm256_slli_epi16(TLR1, 1), _mm256_add_epi16(TLR1, TRL0));
    T0_256 = _mm256_add_epi16(_mm256_slli_epi16(TLR0, 3), _mm256_add_epi16(T0_256, T2_256));
    T1_256 = _mm256_srli_epi16(_mm256_add_epi16(T0_256, c_4_256), 4);
    TLR0w = _mm256_blendv_epi8(TLR0w, T1_256, _mm256_cmpeq_epi16(FS_256, c_2_256));
    /* fs == 3 */
    T2_256 = _mm256_slli_epi16(T2_256, 1); // (L0 << 2) + (R0 << 2) + 8
    T0_256 = _mm256_add_epi16(_mm256_slli_epi16(TLR1, 2), _mm256_add_epi16(TLR2, TRL1));
    T0_256 = _mm256_add_epi16(_mm256_slli_epi16(TLR0, 1), _mm256_add_epi16(T0_256, T2_256));
    T1_256 = _mm256_srli_epi16(T0_256, 4);
    TLR0w = _mm256_blendv_epi8(TLR0w, T1_256, _mm256_cmpeq_epi16(FS_256, c_3_256));
    T0_256 = _mm256_add_epi16(_mm256_add_epi16(TLR2, TRL0), _mm256_slli_epi16(TLR2, 1));
    T0_256 = _mm256_add_epi16(T0_256, _mm256_slli_epi16(TLR1, 3));
    T0_256 = _mm256_add_epi16(T0_256, _mm256_slli_epi16(TLR0, 2));
    T1_256 = _mm256_srli_epi16(_mm256_add_epi16(T0_256, c_8_256), 4);
    TLR1w = _mm256_blendv_epi8(TLR1w, T1_256, _mm256_cmpeq_epi16(FS_256, c_3_256));
    /* store result */
    T4 = _mm_packus_epi16(TL3, _mm256_extracti128_si256(TLR0w, 0x01));
    T5 = _mm_packus_epi16(TL2, _mm256_extracti128_si256(TLR1w, 0x01));
    T6 = _mm_packus_epi16(_mm256_castsi256_si128(TLR1w), TR2);
    T7 = _mm_packus_epi16(_mm256_castsi256_si128(TLR0w), TR3);
    T0 = _mm_unpacklo_epi8(T4, T5);
    T1 = _mm_unpacklo_epi8(T6, T7);
    T2 = _mm_unpackhi_epi8(T4, T5);
    T3 = _mm_unpackhi_epi8(T6, T7);
    T4 = _mm_unpacklo_epi16(T0, T1);
    T5 = _mm_unpacklo_epi16(T2, T3);
    T6 = _mm_unpackhi_epi16(T0, T1);
    T7 = _mm_unpackhi_epi16(T2, T3);
    T0 = _mm_unpacklo_epi32(T4, T5);
    T1 = _mm_unpackhi_epi32(T4, T5);
    T2 = _mm_unpacklo_epi32(T6, T7);
    T3 = _mm_unpackhi_epi32(T6, T7);
    pTmp = SrcPtrU - 4;
    _mm_storel_epi64((__m128i*)(pTmp), T0);
    _mm_storel_epi64((__m128i*)(pTmp + stride), _mm_srli_si128(T0, 8));
    _mm_storel_epi64((__m128i*)(pTmp + (stride << 1)), T1);
    _mm_storel_epi64((__m128i*)(pTmp + stride * 3), _mm_srli_si128(T1, 8));
    pTmp = SrcPtrV - 4;
    _mm_storel_epi64((__m128i*)(pTmp), T2);
    _mm_storel_epi64((__m128i*)(pTmp + stride), _mm_srli_si128(T2, 8));
    _mm_storel_epi64((__m128i*)(pTmp + (stride << 1)), T3);
    _mm_storel_epi64((__m128i*)(pTmp + stride * 3), _mm_srli_si128(T3, 8));
}

void deblock_edge_hor_avx2(pel_t *SrcPtr, int stride, int Alpha, int Beta, uint8_t *flt_flag)
{
    int inc = stride;
    int inc2 = inc << 1;
    int inc3 = inc + inc2;
    int flag0 = flt_flag[0] ? -1 : 0;
    int flag1 = flt_flag[1] ?
-1 : 0; __m128i TL0, TL1, TL2; __m128i TR0, TR1, TR2; __m128i T0, T1, T2; __m128i M0, M1; __m128i FLT, FS; __m128i FS3, FS4, FS56; __m256i TLR0, TLR1, TLR2; // store TL* and TR* __m256i TRL0, TRL1, TRL2; // store TR* and TL* __m256i T0_256, T1_256, T2_256; __m256i FLT_X; __m256i TLR0w, TLR1w; __m256i FS_256; __m128i ALPHA = _mm_set1_epi16((short)Alpha); __m128i BETA = _mm_set1_epi16((short)Beta); __m128i c_0 = _mm_set1_epi16(0); __m256i c_0_256 = _mm256_setzero_si256(); __m256i c_1_256 = _mm256_set1_epi16(1); __m256i c_2_256 = _mm256_set1_epi16(2); __m256i c_3_256 = _mm256_set1_epi16(3); __m256i c_4_256 = _mm256_set1_epi16(4); __m256i c_8_256 = _mm256_set1_epi16(8); __m256i c_16_256 = _mm256_set1_epi16(16); __m256i BETA_256 = _mm256_set1_epi16((short)Beta); TL2 = _mm_loadl_epi64((__m128i*)(SrcPtr - inc3)); TL1 = _mm_loadl_epi64((__m128i*)(SrcPtr - inc2)); TL0 = _mm_loadl_epi64((__m128i*)(SrcPtr - inc)); TR0 = _mm_loadl_epi64((__m128i*)(SrcPtr + 0)); TR1 = _mm_loadl_epi64((__m128i*)(SrcPtr + inc)); TR2 = _mm_loadl_epi64((__m128i*)(SrcPtr + inc2)); TL2 = _mm_unpacklo_epi8(TL2, c_0); TL1 = _mm_unpacklo_epi8(TL1, c_0); TL0 = _mm_unpacklo_epi8(TL0, c_0); TR0 = _mm_unpacklo_epi8(TR0, c_0); TR1 = _mm_unpacklo_epi8(TR1, c_0); TR2 = _mm_unpacklo_epi8(TR2, c_0); TLR0 = _mm256_inserti128_si256(_mm256_castsi128_si256(TL0), TR0, 1); TLR1 = _mm256_inserti128_si256(_mm256_castsi128_si256(TL1), TR1, 1); TLR2 = _mm256_inserti128_si256(_mm256_castsi128_si256(TL2), TR2, 1); TRL0 = _mm256_inserti128_si256(_mm256_castsi128_si256(TR0), TL0, 1); TRL1 = _mm256_inserti128_si256(_mm256_castsi128_si256(TR1), TL1, 1); T0 = _mm_abs_epi16(_mm_subs_epi16(TL0, TR0)); T1 = _mm_cmpgt_epi16(T0, _mm256_castsi256_si128(c_1_256)); T2 = _mm_cmpgt_epi16(ALPHA, T0); M0 = _mm_set_epi32(flag1, flag1, flag0, flag0); M0 = _mm_and_si128(M0, _mm_and_si128(T1, T2)); // mask1 T0_256 = _mm256_abs_epi16(_mm256_subs_epi16(TLR1, TLR0)); FLT_X = _mm256_and_si256(_mm256_cmpgt_epi16(BETA_256, T0_256), c_2_256); T0_256 = _mm256_abs_epi16(_mm256_subs_epi16(TLR2, TLR0)); T1_256 = _mm256_and_si256(_mm256_cmpgt_epi16(BETA_256, T0_256), c_1_256); FLT_X = _mm256_add_epi16(T1_256, FLT_X); FLT = _mm_add_epi16(_mm256_castsi256_si128(FLT_X), _mm256_extracti128_si256(FLT_X, 0x01)); T0_256 = _mm256_cmpeq_epi16(TLR1, TLR0); M1 = _mm_and_si128(_mm256_castsi256_si128(T0_256), _mm256_extracti128_si256(T0_256, 0x01)); T0 = _mm_subs_epi16(FLT, _mm256_castsi256_si128(c_2_256)); T1 = _mm_subs_epi16(FLT, _mm256_castsi256_si128(c_3_256)); T2 = _mm_abs_epi16(_mm_subs_epi16(TL1, TR1)); FS56 = _mm_blendv_epi8(T1, T0, M1); FS4 = _mm_blendv_epi8(_mm256_castsi256_si128(c_1_256), _mm256_castsi256_si128(c_2_256), _mm_cmpeq_epi16(_mm256_castsi256_si128(FLT_X), _mm256_castsi256_si128(c_2_256))); FS3 = _mm_blendv_epi8(c_0, _mm256_castsi256_si128(c_1_256), _mm_cmpgt_epi16(BETA, T2)); FS = _mm_blendv_epi8(c_0, FS56, _mm_cmpgt_epi16(FLT, _mm256_castsi256_si128(c_4_256))); FS = _mm_blendv_epi8(FS, FS4, _mm_cmpeq_epi16(FLT, _mm256_castsi256_si128(c_4_256))); FS = _mm_blendv_epi8(FS, FS3, _mm_cmpeq_epi16(FLT, _mm256_castsi256_si128(c_3_256))); FS = _mm_and_si128(FS, M0); FS_256 = _mm256_inserti128_si256(_mm256_castsi128_si256(FS), FS, 1); TLR0w = TLR0; TLR1w = TLR1; /* fs == 1 */ T2 = _mm_add_epi16(_mm_add_epi16(TL0, TR0), _mm256_castsi256_si128(c_2_256)); // L0 + R0 + 2 T2_256 = _mm256_castsi128_si256(T2); T2_256 = _mm256_inserti128_si256(T2_256, T2, 1); // save T1_256 = _mm256_srli_epi16(_mm256_add_epi16(_mm256_slli_epi16(TLR0, 1), T2_256), 2); TLR0w = _mm256_blendv_epi8(TLR0, 
T1_256, _mm256_cmpeq_epi16(FS_256, c_1_256));
    /* fs == 2 */
    T2_256 = _mm256_slli_epi16(T2_256, 1);
    T0_256 = _mm256_add_epi16(_mm256_slli_epi16(TLR1, 1), _mm256_add_epi16(TLR1, TRL0));
    T0_256 = _mm256_add_epi16(_mm256_slli_epi16(TLR0, 3), _mm256_add_epi16(T0_256, T2_256));
    T1_256 = _mm256_srli_epi16(_mm256_add_epi16(T0_256, c_4_256), 4);
    TLR0w = _mm256_blendv_epi8(TLR0w, T1_256, _mm256_cmpeq_epi16(FS_256, c_2_256));
    /* fs == 3 */
    T2_256 = _mm256_slli_epi16(T2_256, 1); // (L0 << 2) + (R0 << 2) + 8
    T0_256 = _mm256_add_epi16(_mm256_slli_epi16(TLR1, 2), _mm256_add_epi16(TLR2, TRL1));
    T0_256 = _mm256_add_epi16(_mm256_slli_epi16(TLR0, 1), _mm256_add_epi16(T0_256, T2_256));
    T1_256 = _mm256_srli_epi16(T0_256, 4);
    TLR0w = _mm256_blendv_epi8(TLR0w, T1_256, _mm256_cmpeq_epi16(FS_256, c_3_256));
    T0_256 = _mm256_add_epi16(_mm256_add_epi16(TLR2, TRL0), _mm256_slli_epi16(TLR2, 1));
    T0_256 = _mm256_add_epi16(T0_256, _mm256_slli_epi16(TLR1, 3));
    T0_256 = _mm256_add_epi16(T0_256, _mm256_slli_epi16(TLR0, 2));
    T1_256 = _mm256_srli_epi16(_mm256_add_epi16(T0_256, c_8_256), 4);
    TLR1w = _mm256_blendv_epi8(TLR1w, T1_256, _mm256_cmpeq_epi16(FS_256, c_3_256));
    FS = _mm_cmpeq_epi16(FS, _mm256_castsi256_si128(c_4_256));
    if (_mm_extract_epi64(FS, 0) || _mm_extract_epi64(FS, 1)) {
        /* fs == 4 */
        TRL2 = _mm256_inserti128_si256(_mm256_castsi128_si256(TR2), TL2, 1);
        FS_256 = _mm256_inserti128_si256(_mm256_castsi128_si256(FS), FS, 1);
        /* cal L0/R0 */
        T0_256 = _mm256_slli_epi16(_mm256_add_epi16(_mm256_add_epi16(TLR0, TLR2), TRL0), 3);
        T0_256 = _mm256_add_epi16(_mm256_add_epi16(T0_256, c_16_256), _mm256_add_epi16(TLR0, TLR2));
        T2_256 = _mm256_add_epi16(_mm256_slli_epi16(TRL2, 1), _mm256_slli_epi16(TRL2, 2));
        T1_256 = _mm256_srli_epi16(_mm256_add_epi16(T0_256, T2_256), 5);
        TLR0w = _mm256_blendv_epi8(TLR0w, T1_256, FS_256);
        /* cal L1/R1 */
        T0_256 = _mm256_slli_epi16(_mm256_add_epi16(TLR2, TRL0), 1);
        T0_256 = _mm256_add_epi16(T0_256, _mm256_sub_epi16(_mm256_slli_epi16(TLR0, 3), TLR0));
        T2_256 = _mm256_add_epi16(_mm256_slli_epi16(TLR2, 2), _mm256_add_epi16(TRL0, c_8_256));
        T1_256 = _mm256_srli_epi16(_mm256_add_epi16(T0_256, T2_256), 4);
        TLR1w = _mm256_blendv_epi8(TLR1w, T1_256, FS_256);
        /* cal L2/R2 */
        T0_256 = _mm256_add_epi16(_mm256_slli_epi16(TLR2, 1), TLR2);
        T2_256 = _mm256_add_epi16(_mm256_slli_epi16(TLR0, 2), TRL0);
        T1_256 = _mm256_srli_epi16(_mm256_add_epi16(T0_256, _mm256_add_epi16(T2_256, c_4_256)), 3);
        TLR2 = _mm256_blendv_epi8(TLR2, T1_256, FS_256);
        TLR0w = _mm256_packus_epi16(TLR0w, c_0_256);
        TLR1w = _mm256_packus_epi16(TLR1w, c_0_256);
        TLR2 = _mm256_packus_epi16(TLR2, c_0_256);
        /* store result */
        _mm_storel_epi64((__m128i*)(SrcPtr - inc), _mm256_castsi256_si128(TLR0w));
        _mm_storel_epi64((__m128i*)(SrcPtr - 0), _mm256_extracti128_si256(TLR0w, 0x01));
        _mm_storel_epi64((__m128i*)(SrcPtr - inc2), _mm256_castsi256_si128(TLR1w));
        _mm_storel_epi64((__m128i*)(SrcPtr + inc), _mm256_extracti128_si256(TLR1w, 0x01));
        _mm_storel_epi64((__m128i*)(SrcPtr - inc3), _mm256_castsi256_si128(TLR2));
        _mm_storel_epi64((__m128i*)(SrcPtr + inc2), _mm256_extracti128_si256(TLR2, 0x01));
    } else {
        /* store result */
        TLR0w = _mm256_packus_epi16(TLR0w, c_0_256);
        TLR1w = _mm256_packus_epi16(TLR1w, c_0_256);
        _mm_storel_epi64((__m128i*)(SrcPtr - inc), _mm256_castsi256_si128(TLR0w));
        _mm_storel_epi64((__m128i*)(SrcPtr - 0), _mm256_extracti128_si256(TLR0w, 0x01));
        _mm_storel_epi64((__m128i*)(SrcPtr - inc2), _mm256_castsi256_si128(TLR1w));
        _mm_storel_epi64((__m128i*)(SrcPtr + inc), _mm256_extracti128_si256(TLR1w, 0x01));
    }
}

/* note: the i32s_t variables here were changed to int32_t (signed int) */
void deblock_edge_hor_c_avx2(pel_t *SrcPtrU, pel_t *SrcPtrV, int stride, int Alpha, int Beta, uint8_t *flt_flag)
{
    int inc = stride;
    int inc2 = inc << 1;
    int inc3 = inc + inc2;
    int flag0 = flt_flag[0] ? -1 : 0;
    int flag1 = flt_flag[1] ? -1 : 0;
    __m128i T0, T1, T2;
    __m128i M0, M1;
    __m128i FLT, FS;
    __m128i FS4, FS56;
    __m256i TLR0, TLR1, TLR2; // store TL* and TR*
    __m256i TRL0, TRL1;       // store TR* and TL*
    __m256i T0_256, T1_256, T2_256;
    __m256i FLT_X;
    __m256i TLR0w, TLR1w;
    __m256i FS_256;
    __m128i ALPHA = _mm_set1_epi16((short)Alpha);
    __m128i c_0 = _mm_set1_epi16(0);
    __m256i c_0_256 = _mm256_setzero_si256();
    __m256i c_1_256 = _mm256_set1_epi16(1);
    __m256i c_2_256 = _mm256_set1_epi16(2);
    __m256i c_3_256 = _mm256_set1_epi16(3);
    __m256i c_4_256 = _mm256_set1_epi16(4);
    __m256i c_8_256 = _mm256_set1_epi16(8);
    __m256i BETA_256 = _mm256_set1_epi16((short)Beta);
    __m256i mask0 = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, -1);
    __m256i mask1 = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, 0);
    __m256i mask4 = _mm256_set_epi32(0, 0, 0, -1, 0, 0, 0, 0);
    __m256i mask5 = _mm256_set_epi32(0, 0, -1, 0, 0, 0, 0, 0);
    TLR0 = _mm256_set_epi32(0, 0, ((int32_t*)(SrcPtrV))[0], ((int32_t*)(SrcPtrU))[0], 0, 0, ((int32_t*)(SrcPtrV - inc))[0], ((int32_t*)(SrcPtrU - inc))[0]);
    TLR1 = _mm256_set_epi32(0, 0, ((int32_t*)(SrcPtrV + inc))[0], ((int32_t*)(SrcPtrU + inc))[0], 0, 0, ((int32_t*)(SrcPtrV - inc2))[0], ((int32_t*)(SrcPtrU - inc2))[0]);
    TLR2 = _mm256_set_epi32(0, 0, ((int32_t*)(SrcPtrV + inc2))[0], ((int32_t*)(SrcPtrU + inc2))[0], 0, 0, ((int32_t*)(SrcPtrV - inc3))[0], ((int32_t*)(SrcPtrU - inc3))[0]);
    TLR0 = _mm256_unpacklo_epi8(TLR0, c_0_256);
    TLR1 = _mm256_unpacklo_epi8(TLR1, c_0_256);
    TLR2 = _mm256_unpacklo_epi8(TLR2, c_0_256);
    TRL0 = _mm256_inserti128_si256(_mm256_castsi128_si256(_mm256_extracti128_si256(TLR0, 0x01)), _mm256_castsi256_si128(TLR0), 1);
    TRL1 = _mm256_inserti128_si256(_mm256_castsi128_si256(_mm256_extracti128_si256(TLR1, 0x01)), _mm256_castsi256_si128(TLR1), 1);
    T0 = _mm_abs_epi16(_mm_subs_epi16(_mm256_castsi256_si128(TLR0), _mm256_castsi256_si128(TRL0)));
    T1 = _mm_cmpgt_epi16(T0, _mm256_castsi256_si128(c_1_256));
    T2 = _mm_cmpgt_epi16(ALPHA, T0);
    M0 = _mm_set_epi32(flag1, flag0, flag1, flag0);
    M0 = _mm_and_si128(M0, _mm_and_si128(T1, T2)); // mask1
    T0_256 = _mm256_abs_epi16(_mm256_subs_epi16(TLR1, TLR0));
    FLT_X = _mm256_and_si256(_mm256_cmpgt_epi16(BETA_256, T0_256), c_2_256);
    T0_256 = _mm256_abs_epi16(_mm256_subs_epi16(TLR2, TLR0));
    T1_256 = _mm256_and_si256(_mm256_cmpgt_epi16(BETA_256, T0_256), c_1_256);
    FLT_X = _mm256_add_epi16(T1_256, FLT_X);
    FLT = _mm_add_epi16(_mm256_castsi256_si128(FLT_X), _mm256_extracti128_si256(FLT_X, 0x01));
    T0_256 = _mm256_cmpeq_epi16(TLR1, TLR0);
    M1 = _mm_and_si128(_mm256_castsi256_si128(T0_256), _mm256_extracti128_si256(T0_256, 0x01));
    T0 = _mm_subs_epi16(FLT, _mm256_castsi256_si128(c_3_256));
    T1 = _mm_subs_epi16(FLT, _mm256_castsi256_si128(c_4_256));
    FS56 = _mm_blendv_epi8(T1, T0, M1);
    FS4 = _mm_blendv_epi8(c_0, _mm256_castsi256_si128(c_1_256), _mm_cmpeq_epi16(_mm256_castsi256_si128(FLT_X), _mm256_castsi256_si128(c_2_256)));
    FS = _mm_blendv_epi8(c_0, FS56, _mm_cmpgt_epi16(FLT, _mm256_castsi256_si128(c_4_256)));
    FS = _mm_blendv_epi8(FS, FS4, _mm_cmpeq_epi16(FLT, _mm256_castsi256_si128(c_4_256)));
    FS = _mm_and_si128(FS, M0);
    FS_256 = _mm256_inserti128_si256(_mm256_castsi128_si256(FS), FS, 1);
    TLR0w = TLR0;
    TLR1w = TLR1;
    /* fs == 1 */
    T2 = _mm_add_epi16(_mm_add_epi16(_mm256_castsi256_si128(TLR0), _mm256_castsi256_si128(TRL0)), _mm256_castsi256_si128(c_2_256)); // L0 + R0 + 2
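    /* A scalar sketch of the filters that the FS-driven blends below apply,
     * read off the vector arithmetic in this function (L0..L2 are the pixels
     * on one side of the edge, R0..R2 on the other; the R side is filtered
     * symmetrically):
     *   fs == 1: L0' = (3*L0 + R0 + 2) >> 2
     *   fs == 2: L0' = (10*L0 + 3*L1 + 3*R0 + 8) >> 4
     *   fs == 3: L0' = (6*L0 + 4*L1 + L2 + 4*R0 + R1 + 8) >> 4
     *            L1' = (4*L0 + 8*L1 + 3*L2 + R0 + 8) >> 4
     */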
T2_256 = _mm256_castsi128_si256(T2);
    T2_256 = _mm256_inserti128_si256(T2_256, T2, 1); // save
    T1_256 = _mm256_srli_epi16(_mm256_add_epi16(_mm256_slli_epi16(TLR0, 1), T2_256), 2);
    TLR0w = _mm256_blendv_epi8(TLR0, T1_256, _mm256_cmpeq_epi16(FS_256, c_1_256));
    /* fs == 2 */
    T2_256 = _mm256_slli_epi16(T2_256, 1);
    T0_256 = _mm256_add_epi16(_mm256_slli_epi16(TLR1, 1), _mm256_add_epi16(TLR1, TRL0));
    T0_256 = _mm256_add_epi16(_mm256_slli_epi16(TLR0, 3), _mm256_add_epi16(T0_256, T2_256));
    T1_256 = _mm256_srli_epi16(_mm256_add_epi16(T0_256, c_4_256), 4);
    TLR0w = _mm256_blendv_epi8(TLR0w, T1_256, _mm256_cmpeq_epi16(FS_256, c_2_256));
    /* fs == 3 */
    T2_256 = _mm256_slli_epi16(T2_256, 1); // (L0 << 2) + (R0 << 2) + 8
    T0_256 = _mm256_add_epi16(_mm256_slli_epi16(TLR1, 2), _mm256_add_epi16(TLR2, TRL1));
    T0_256 = _mm256_add_epi16(_mm256_slli_epi16(TLR0, 1), _mm256_add_epi16(T0_256, T2_256));
    T1_256 = _mm256_srli_epi16(T0_256, 4);
    TLR0w = _mm256_blendv_epi8(TLR0w, T1_256, _mm256_cmpeq_epi16(FS_256, c_3_256));
    T0_256 = _mm256_add_epi16(_mm256_add_epi16(TLR2, TRL0), _mm256_slli_epi16(TLR2, 1));
    T0_256 = _mm256_add_epi16(T0_256, _mm256_slli_epi16(TLR1, 3));
    T0_256 = _mm256_add_epi16(T0_256, _mm256_slli_epi16(TLR0, 2));
    T1_256 = _mm256_srli_epi16(_mm256_add_epi16(T0_256, c_8_256), 4);
    TLR1w = _mm256_blendv_epi8(TLR1w, T1_256, _mm256_cmpeq_epi16(FS_256, c_3_256));
    /* store result */
    TLR0w = _mm256_packus_epi16(TLR0w, c_0_256);
    TLR1w = _mm256_packus_epi16(TLR1w, c_0_256);
    _mm256_maskstore_epi32(((int32_t*)(SrcPtrU - inc )), mask0, TLR0w);
    _mm256_maskstore_epi32(((int32_t*)(SrcPtrU - 16)), mask4, TLR0w);
    _mm256_maskstore_epi32(((int32_t*)(SrcPtrU - inc2)), mask0, TLR1w);
    _mm256_maskstore_epi32(((int32_t*)(SrcPtrU + inc - 16)), mask4, TLR1w);
    _mm256_maskstore_epi32(((int32_t*)(SrcPtrV - inc - 4)), mask1, TLR0w);
    _mm256_maskstore_epi32(((int32_t*)(SrcPtrV - 20)), mask5, TLR0w);
    _mm256_maskstore_epi32(((int32_t*)(SrcPtrV - inc2 - 4)), mask1, TLR1w);
    _mm256_maskstore_epi32(((int32_t*)(SrcPtrV + inc - 20)), mask5, TLR1w);
}
#endif

davs2-1.6/source/common/vec/intrinsic_idct.cc

/*
 * intrinsic_idct.cc
 *
 * Description of this file:
 *    SSE assembly functions of IDCT module of the davs2 library
 *
 * --------------------------------------------------------------------------
 *
 * davs2 - video decoder of AVS2/IEEE1857.4 video coding standard
 * Copyright (C) 2018~ VCL, NELVT, Peking University
 *
 * Authors: Falei LUO
 *          etc.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at sswang @ pku.edu.cn.
 */

#include "../common.h"
#include "intrinsic.h"

#include <mmintrin.h>
#include <emmintrin.h>
#include <tmmintrin.h>
#include <smmintrin.h>

ALIGN32(static const coeff_t tab_idct_8x8[12][8]) = {
    {  44,  38,  44,  38,  44,  38,  44,  38 },
    {  25,   9,  25,   9,  25,   9,  25,   9 },
    {  38,  -9,  38,  -9,  38,  -9,  38,  -9 },
    { -44, -25, -44, -25, -44, -25, -44, -25 },
    {  25, -44,  25, -44,  25, -44,  25, -44 },
    {   9,  38,   9,  38,   9,  38,   9,  38 },
    {   9, -25,   9, -25,   9, -25,   9, -25 },
    {  38, -44,  38, -44,  38, -44,  38, -44 },
    {  32,  32,  32,  32,  32,  32,  32,  32 },
    {  32, -32,  32, -32,  32, -32,  32, -32 },
    {  42,  17,  42,  17,  42,  17,  42,  17 },
    {  17, -42,  17, -42,  17, -42,  17, -42 }
};

/* --------------------------------------------------------------------------- */
ALIGN16(static const int16_t g_2T[SEC_TR_SIZE * SEC_TR_SIZE]) = {
    123,  -35,   -8,   -3,
    -32, -120,   30,   10,
     14,   25,  123,  -22,
      8,   13,   19,  126
};

/* --------------------------------------------------------------------------- */
ALIGN16(static const int16_t g_2T_C[SEC_TR_SIZE * SEC_TR_SIZE]) = {
    34,  58,  72,  81,
    77,  69,  -7, -75,
    79, -33, -75,  58,
    55, -84,  73, -28
};

/* --------------------------------------------------------------------------- */
void idct_4x4_sse128(const coeff_t *src, coeff_t *dst, int i_dst)
{
    const int shift1 = 5;
    const int shift2 = 20 - g_bit_depth;
    // const int clip_depth1 = LIMIT_BIT;
    const int clip_depth2 = g_bit_depth + 1;
    const __m128i c16_p17_p42 = _mm_set1_epi32(0x0011002A);
    const __m128i c16_n42_p17 = _mm_set1_epi32(0xFFD60011);
    const __m128i c16_n32_p32 = _mm_set1_epi32(0xFFE00020);
    const __m128i c16_p32_p32 = _mm_set1_epi32(0x00200020);
    __m128i c32_rnd = _mm_set1_epi32(1 << (shift1 - 1)); // add1
    __m128i S0, S1;
    __m128i T0, T1;
    __m128i E0, E1, O0, O1;
    S0 = _mm_loadu_si128((__m128i*)(src));
    S1 = _mm_loadu_si128((__m128i*)(src + 8));
    T0 = _mm_unpacklo_epi16(S0, S1);
    E0 = _mm_add_epi32(_mm_madd_epi16(T0, c16_p32_p32), c32_rnd);
    E1 = _mm_add_epi32(_mm_madd_epi16(T0, c16_n32_p32), c32_rnd);
    T1 = _mm_unpackhi_epi16(S0, S1);
    O0 = _mm_madd_epi16(T1, c16_p17_p42);
    O1 = _mm_madd_epi16(T1, c16_n42_p17);
    S0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E0, O0), shift1), _mm_srai_epi32(_mm_sub_epi32(E1, O1), shift1));
    S1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E1, O1), shift1), _mm_srai_epi32(_mm_sub_epi32(E0, O0), shift1));
    /* inverse */
    T0 = _mm_unpacklo_epi16(S0, S1);
    T1 = _mm_unpackhi_epi16(S0, S1);
    S0 = _mm_unpacklo_epi32(T0, T1);
    S1 = _mm_unpackhi_epi32(T0, T1);
    /* second pass ------------------------------------------------- */
    c32_rnd = _mm_set1_epi32(1 << (shift2 - 1)); // add2
    T0 = _mm_unpacklo_epi16(S0, S1);
    E0 = _mm_add_epi32(_mm_madd_epi16(T0, c16_p32_p32), c32_rnd);
    E1 = _mm_add_epi32(_mm_madd_epi16(T0, c16_n32_p32), c32_rnd);
    T1 = _mm_unpackhi_epi16(S0, S1);
    O0 = _mm_madd_epi16(T1, c16_p17_p42);
    O1 = _mm_madd_epi16(T1, c16_n42_p17);
    S0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E0, O0), shift2), _mm_srai_epi32(_mm_sub_epi32(E1, O1), shift2));
    S1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E1, O1), shift2), _mm_srai_epi32(_mm_sub_epi32(E0, O0), shift2));
    T0 = _mm_unpacklo_epi16(S0, S1);
    T1 = _mm_unpackhi_epi16(S0, S1);
    S0 = _mm_unpacklo_epi32(T0, T1);
    S1 = _mm_unpackhi_epi32(T0, T1);
    // clip
    {
        const __m128i max_val = _mm_set1_epi16((1 << (clip_depth2 - 1)) - 1);
        const __m128i min_val = _mm_set1_epi16(-(1 << (clip_depth2 - 1)));
        S0 = _mm_max_epi16(_mm_min_epi16(S0, max_val), min_val);
        S1 = _mm_max_epi16(_mm_min_epi16(S1, max_val), min_val);
    }
    // store
    if (i_dst == 4) {
        _mm_store_si128((__m128i*)(dst + 0), S0);
        _mm_store_si128((__m128i*)(dst + 8), S1);
    } else {
        _mm_storel_epi64((__m128i*)(dst +
0 * i_dst), S0); _mm_storeh_pi((__m64 *)(dst + 1 * i_dst), _mm_castsi128_ps(S0)); _mm_storel_epi64((__m128i*)(dst + 2 * i_dst), S1); _mm_storeh_pi((__m64 *)(dst + 3 * i_dst), _mm_castsi128_ps(S1)); } } /* --------------------------------------------------------------------------- */ void idct_4x16_sse128(const coeff_t *src, coeff_t *dst, int i_dst) { const int shift1 = 5; const int shift2 = 20 - g_bit_depth; // const int clip_depth1 = LIMIT_BIT; const int clip_depth2 = g_bit_depth + 1; const __m128i c16_p43_p45 = _mm_set1_epi32(0x002B002D); //row0 87high - 90low address const __m128i c16_p35_p40 = _mm_set1_epi32(0x00230028); const __m128i c16_p21_p29 = _mm_set1_epi32(0x0015001D); const __m128i c16_p04_p13 = _mm_set1_epi32(0x0004000D); const __m128i c16_p29_p43 = _mm_set1_epi32(0x001D002B); //row1 const __m128i c16_n21_p04 = _mm_set1_epi32(0xFFEB0004); const __m128i c16_n45_n40 = _mm_set1_epi32(0xFFD3FFD8); const __m128i c16_n13_n35 = _mm_set1_epi32(0xFFF3FFDD); const __m128i c16_p04_p40 = _mm_set1_epi32(0x00040028); //row2 const __m128i c16_n43_n35 = _mm_set1_epi32(0xFFD5FFDD); const __m128i c16_p29_n13 = _mm_set1_epi32(0x001DFFF3); const __m128i c16_p21_p45 = _mm_set1_epi32(0x0015002D); const __m128i c16_n21_p35 = _mm_set1_epi32(0xFFEB0023); //row3 const __m128i c16_p04_n43 = _mm_set1_epi32(0x0004FFD5); const __m128i c16_p13_p45 = _mm_set1_epi32(0x000D002D); const __m128i c16_n29_n40 = _mm_set1_epi32(0xFFE3FFD8); const __m128i c16_n40_p29 = _mm_set1_epi32(0xFFD8001D); //row4 const __m128i c16_p45_n13 = _mm_set1_epi32(0x002DFFF3); const __m128i c16_n43_n04 = _mm_set1_epi32(0xFFD5FFFC); const __m128i c16_p35_p21 = _mm_set1_epi32(0x00230015); const __m128i c16_n45_p21 = _mm_set1_epi32(0xFFD30015); //row5 const __m128i c16_p13_p29 = _mm_set1_epi32(0x000D001D); const __m128i c16_p35_n43 = _mm_set1_epi32(0x0023FFD5); const __m128i c16_n40_p04 = _mm_set1_epi32(0xFFD80004); const __m128i c16_n35_p13 = _mm_set1_epi32(0xFFDD000D); //row6 const __m128i c16_n40_p45 = _mm_set1_epi32(0xFFD8002D); const __m128i c16_p04_p21 = _mm_set1_epi32(0x00040015); const __m128i c16_p43_n29 = _mm_set1_epi32(0x002BFFE3); const __m128i c16_n13_p04 = _mm_set1_epi32(0xFFF30004); //row7 const __m128i c16_n29_p21 = _mm_set1_epi32(0xFFE30015); const __m128i c16_n40_p35 = _mm_set1_epi32(0xFFD80023); const __m128i c16_n45_p43 = _mm_set1_epi32(0xFFD3002B); const __m128i c16_p38_p44 = _mm_set1_epi32(0x0026002C); const __m128i c16_p09_p25 = _mm_set1_epi32(0x00090019); const __m128i c16_n09_p38 = _mm_set1_epi32(0xFFF70026); const __m128i c16_n25_n44 = _mm_set1_epi32(0xFFE7FFD4); const __m128i c16_n44_p25 = _mm_set1_epi32(0xFFD40019); const __m128i c16_p38_p09 = _mm_set1_epi32(0x00260009); const __m128i c16_n25_p09 = _mm_set1_epi32(0xFFE70009); const __m128i c16_n44_p38 = _mm_set1_epi32(0xFFD40026); const __m128i c16_p17_p42 = _mm_set1_epi32(0x0011002A); const __m128i c16_n42_p17 = _mm_set1_epi32(0xFFD60011); const __m128i c16_n32_p32 = _mm_set1_epi32(0xFFE00020); const __m128i c16_p32_p32 = _mm_set1_epi32(0x00200020); __m128i c32_rnd = _mm_set1_epi32(1 << (shift1 - 1)); // add1 // DCT1 __m128i in00, in01, in02, in03, in04, in05, in06, in07; __m128i res00, res01, res02, res03, res04, res05, res06, res07; in00 = _mm_loadu_si128((const __m128i*)&src[ 0 * 4]); // [07 06 05 04 03 02 01 00] in01 = _mm_loadu_si128((const __m128i*)&src[ 2 * 4]); // [27 26 25 24 23 22 21 20] in02 = _mm_loadu_si128((const __m128i*)&src[ 4 * 4]); // [47 46 45 44 43 42 41 40] in03 = _mm_loadu_si128((const __m128i*)&src[ 6 * 4]); // [67 66 65 64 63 62 61 60] 
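    /* The eight loads in00..in07 each pick up two adjacent 4-coefficient rows
     * of the 4x16 block (src stores the block row by row, four coefficients
     * per row), so together they cover rows 0..15 of the input. */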
in04 = _mm_loadu_si128((const __m128i*)&src[ 8 * 4]); in05 = _mm_loadu_si128((const __m128i*)&src[10 * 4]); in06 = _mm_loadu_si128((const __m128i*)&src[12 * 4]); in07 = _mm_loadu_si128((const __m128i*)&src[14 * 4]); { const __m128i T_00_00A = _mm_unpackhi_epi16(in00, in01); // [33 13 32 12 31 11 30 10] const __m128i T_00_01A = _mm_unpackhi_epi16(in02, in03); // [ ] const __m128i T_00_02A = _mm_unpackhi_epi16(in04, in05); // [ ] const __m128i T_00_03A = _mm_unpackhi_epi16(in06, in07); // [ ] const __m128i T_00_04A = _mm_unpacklo_epi16(in01, in03); // [ ] const __m128i T_00_05A = _mm_unpacklo_epi16(in05, in07); // [ ] const __m128i T_00_06A = _mm_unpacklo_epi16(in02, in06); // [ ]row const __m128i T_00_07A = _mm_unpacklo_epi16(in00, in04); // [83 03 82 02 81 01 81 00] row08 row00 __m128i O0A, O1A, O2A, O3A, O4A, O5A, O6A, O7A; __m128i EO0A, EO1A, EO2A, EO3A; __m128i EEO0A, EEO1A; __m128i EEE0A, EEE1A; #define COMPUTE_ROW(row0103, row0507, row0911, row1315, c0103, c0507, c0911, c1315, row) \ row = _mm_add_epi32(_mm_add_epi32(_mm_madd_epi16(row0103, c0103), _mm_madd_epi16(row0507, c0507)), \ _mm_add_epi32(_mm_madd_epi16(row0911, c0911), _mm_madd_epi16(row1315, c1315))); COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_p43_p45, c16_p35_p40, c16_p21_p29, c16_p04_p13, O0A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_p29_p43, c16_n21_p04, c16_n45_n40, c16_n13_n35, O1A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_p04_p40, c16_n43_n35, c16_p29_n13, c16_p21_p45, O2A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n21_p35, c16_p04_n43, c16_p13_p45, c16_n29_n40, O3A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n40_p29, c16_p45_n13, c16_n43_n04, c16_p35_p21, O4A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n45_p21, c16_p13_p29, c16_p35_n43, c16_n40_p04, O5A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n35_p13, c16_n40_p45, c16_p04_p21, c16_p43_n29, O6A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n13_p04, c16_n29_p21, c16_n40_p35, c16_n45_p43, O7A) #undef COMPUTE_ROW EO0A = _mm_add_epi32(_mm_madd_epi16(T_00_04A, c16_p38_p44), _mm_madd_epi16(T_00_05A, c16_p09_p25)); // EO0 EO1A = _mm_add_epi32(_mm_madd_epi16(T_00_04A, c16_n09_p38), _mm_madd_epi16(T_00_05A, c16_n25_n44)); // EO1 EO2A = _mm_add_epi32(_mm_madd_epi16(T_00_04A, c16_n44_p25), _mm_madd_epi16(T_00_05A, c16_p38_p09)); // EO2 EO3A = _mm_add_epi32(_mm_madd_epi16(T_00_04A, c16_n25_p09), _mm_madd_epi16(T_00_05A, c16_n44_p38)); // EO3 EEO0A = _mm_madd_epi16(T_00_06A, c16_p17_p42); EEO1A = _mm_madd_epi16(T_00_06A, c16_n42_p17); EEE0A = _mm_madd_epi16(T_00_07A, c16_p32_p32); EEE1A = _mm_madd_epi16(T_00_07A, c16_n32_p32); { const __m128i EE0A = _mm_add_epi32(EEE0A, EEO0A); // EE0 = EEE0 + EEO0 const __m128i EE1A = _mm_add_epi32(EEE1A, EEO1A); // EE1 = EEE1 + EEO1 const __m128i EE3A = _mm_sub_epi32(EEE0A, EEO0A); // EE2 = EEE0 - EEO0 const __m128i EE2A = _mm_sub_epi32(EEE1A, EEO1A); // EE3 = EEE1 - EEO1 const __m128i T10A = _mm_add_epi32(_mm_add_epi32(EE0A, EO0A), c32_rnd); // E0 (= EE0 + EO0) + rnd const __m128i T11A = _mm_add_epi32(_mm_add_epi32(EE1A, EO1A), c32_rnd); // E1 (= EE1 + EO1) + rnd const __m128i T12A = _mm_add_epi32(_mm_add_epi32(EE2A, EO2A), c32_rnd); // E2 (= EE2 + EO2) + rnd const __m128i T13A = _mm_add_epi32(_mm_add_epi32(EE3A, EO3A), c32_rnd); // E3 (= EE3 + EO3) + rnd const __m128i T14A = _mm_add_epi32(_mm_sub_epi32(EE3A, EO3A), c32_rnd); // E4 (= EE3 - EO3) + rnd const __m128i T15A = _mm_add_epi32(_mm_sub_epi32(EE2A, EO2A), c32_rnd); // E5 (= EE2 - 
EO2) + rnd const __m128i T16A = _mm_add_epi32(_mm_sub_epi32(EE1A, EO1A), c32_rnd); // E6 (= EE1 - EO1) + rnd const __m128i T17A = _mm_add_epi32(_mm_sub_epi32(EE0A, EO0A), c32_rnd); // E7 (= EE0 - EO0) + rnd const __m128i T30A = _mm_srai_epi32(_mm_add_epi32(T10A, O0A), shift1); // E0 + O0 + rnd [30 20 10 00] const __m128i T31A = _mm_srai_epi32(_mm_add_epi32(T11A, O1A), shift1); // E1 + O1 + rnd [31 21 11 01] const __m128i T32A = _mm_srai_epi32(_mm_add_epi32(T12A, O2A), shift1); // E2 + O2 + rnd [32 22 12 02] const __m128i T33A = _mm_srai_epi32(_mm_add_epi32(T13A, O3A), shift1); // E3 + O3 + rnd [33 23 13 03] const __m128i T34A = _mm_srai_epi32(_mm_add_epi32(T14A, O4A), shift1); // E4 [33 24 14 04] const __m128i T35A = _mm_srai_epi32(_mm_add_epi32(T15A, O5A), shift1); // E5 [35 25 15 05] const __m128i T36A = _mm_srai_epi32(_mm_add_epi32(T16A, O6A), shift1); // E6 [36 26 16 06] const __m128i T37A = _mm_srai_epi32(_mm_add_epi32(T17A, O7A), shift1); // E7 [37 27 17 07] const __m128i T38A = _mm_srai_epi32(_mm_sub_epi32(T17A, O7A), shift1); // E7 [30 20 10 00] x8 const __m128i T39A = _mm_srai_epi32(_mm_sub_epi32(T16A, O6A), shift1); // E6 [31 21 11 01] x9 const __m128i T3AA = _mm_srai_epi32(_mm_sub_epi32(T15A, O5A), shift1); // E5 [32 22 12 02] xA const __m128i T3BA = _mm_srai_epi32(_mm_sub_epi32(T14A, O4A), shift1); // E4 [33 23 13 03] xB const __m128i T3CA = _mm_srai_epi32(_mm_sub_epi32(T13A, O3A), shift1); // E3 - O3 + rnd [33 24 14 04] xC const __m128i T3DA = _mm_srai_epi32(_mm_sub_epi32(T12A, O2A), shift1); // E2 - O2 + rnd [35 25 15 05] xD const __m128i T3EA = _mm_srai_epi32(_mm_sub_epi32(T11A, O1A), shift1); // E1 - O1 + rnd [36 26 16 06] xE const __m128i T3FA = _mm_srai_epi32(_mm_sub_epi32(T10A, O0A), shift1); // E0 - O0 + rnd [37 27 17 07] xF res00 = _mm_packs_epi32(T30A, T38A); res01 = _mm_packs_epi32(T31A, T39A); res02 = _mm_packs_epi32(T32A, T3AA); res03 = _mm_packs_epi32(T33A, T3BA); res04 = _mm_packs_epi32(T34A, T3CA); res05 = _mm_packs_epi32(T35A, T3DA); res06 = _mm_packs_epi32(T36A, T3EA); res07 = _mm_packs_epi32(T37A, T3FA); } } // transpose matrix { __m128i tr0_0, tr0_1, tr0_2, tr0_3, tr0_4, tr0_5, tr0_6, tr0_7; __m128i tr1_0, tr1_1, tr1_2, tr1_3, tr1_4, tr1_5, tr1_6, tr1_7; __m128i E01, E02, E03, E04, E11, E12, E13, E14; __m128i O01, O02, O03, O04, O11, O12, O13, O14; __m128i T0, T1, T2, T3; tr0_0 = _mm_unpacklo_epi16(res00, res01); tr0_1 = _mm_unpackhi_epi16(res00, res01); tr0_2 = _mm_unpacklo_epi16(res02, res03); tr0_3 = _mm_unpackhi_epi16(res02, res03); tr0_4 = _mm_unpacklo_epi16(res04, res05); tr0_5 = _mm_unpackhi_epi16(res04, res05); tr0_6 = _mm_unpacklo_epi16(res06, res07); tr0_7 = _mm_unpackhi_epi16(res06, res07); tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_2); tr1_1 = _mm_unpackhi_epi32(tr0_0, tr0_2); tr1_2 = _mm_unpacklo_epi32(tr0_1, tr0_3); tr1_3 = _mm_unpackhi_epi32(tr0_1, tr0_3); tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_6); tr1_5 = _mm_unpackhi_epi32(tr0_4, tr0_6); tr1_6 = _mm_unpacklo_epi32(tr0_5, tr0_7); tr1_7 = _mm_unpackhi_epi32(tr0_5, tr0_7); res00 = _mm_unpacklo_epi64(tr1_0, tr1_4); res02 = _mm_unpackhi_epi64(tr1_0, tr1_4); res04 = _mm_unpacklo_epi64(tr1_1, tr1_5); res06 = _mm_unpackhi_epi64(tr1_1, tr1_5); res01 = _mm_unpacklo_epi64(tr1_2, tr1_6); res03 = _mm_unpackhi_epi64(tr1_2, tr1_6); res05 = _mm_unpacklo_epi64(tr1_3, tr1_7); res07 = _mm_unpackhi_epi64(tr1_3, tr1_7); c32_rnd = _mm_set1_epi32(1 << (shift2 - 1)); // add2 T0 = _mm_unpacklo_epi16(res00, res04); E01 = _mm_add_epi32(_mm_madd_epi16(T0, c16_p32_p32), c32_rnd); E11 = _mm_add_epi32(_mm_madd_epi16(T0, 
c16_n32_p32), c32_rnd); T1 = _mm_unpackhi_epi16(res00, res04); E02 = _mm_add_epi32(_mm_madd_epi16(T1, c16_p32_p32), c32_rnd); E12 = _mm_add_epi32(_mm_madd_epi16(T1, c16_n32_p32), c32_rnd); T0 = _mm_unpacklo_epi16(res01, res05); E03 = _mm_add_epi32(_mm_madd_epi16(T0, c16_p32_p32), c32_rnd); E13 = _mm_add_epi32(_mm_madd_epi16(T0, c16_n32_p32), c32_rnd); T1 = _mm_unpackhi_epi16(res01, res05); E04 = _mm_add_epi32(_mm_madd_epi16(T1, c16_p32_p32), c32_rnd); E14 = _mm_add_epi32(_mm_madd_epi16(T1, c16_n32_p32), c32_rnd); T0 = _mm_unpacklo_epi16(res02, res06); O01 = _mm_madd_epi16(T0, c16_p17_p42); O11 = _mm_madd_epi16(T0, c16_n42_p17); T1 = _mm_unpackhi_epi16(res02, res06); O02 = _mm_madd_epi16(T1, c16_p17_p42); O12 = _mm_madd_epi16(T1, c16_n42_p17); T0 = _mm_unpacklo_epi16(res03, res07); O03 = _mm_madd_epi16(T0, c16_p17_p42); O13 = _mm_madd_epi16(T0, c16_n42_p17); T1 = _mm_unpackhi_epi16(res03, res07); O04 = _mm_madd_epi16(T1, c16_p17_p42); O14 = _mm_madd_epi16(T1, c16_n42_p17); res00 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E01, O01), shift2), _mm_srai_epi32(_mm_add_epi32(E02, O02), shift2)); res01 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E03, O03), shift2), _mm_srai_epi32(_mm_add_epi32(E04, O04), shift2)); res06 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E01, O01), shift2), _mm_srai_epi32(_mm_sub_epi32(E02, O02), shift2)); res07 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E03, O03), shift2), _mm_srai_epi32(_mm_sub_epi32(E04, O04), shift2)); res02 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E11, O11), shift2), _mm_srai_epi32(_mm_add_epi32(E12, O12), shift2)); res03 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E13, O13), shift2), _mm_srai_epi32(_mm_add_epi32(E14, O14), shift2)); res04 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E11, O11), shift2), _mm_srai_epi32(_mm_sub_epi32(E12, O12), shift2)); res05 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E13, O13), shift2), _mm_srai_epi32(_mm_sub_epi32(E14, O14), shift2)); T0 = _mm_unpacklo_epi16(res00, res02); T1 = _mm_unpackhi_epi16(res00, res02); T2 = _mm_unpacklo_epi16(res04, res06); T3 = _mm_unpackhi_epi16(res04, res06); res00 = _mm_unpacklo_epi32(T0, T2); res02 = _mm_unpackhi_epi32(T0, T2); res04 = _mm_unpacklo_epi32(T1, T3); res06 = _mm_unpackhi_epi32(T1, T3); T0 = _mm_unpacklo_epi16(res01, res03); T1 = _mm_unpackhi_epi16(res01, res03); T2 = _mm_unpacklo_epi16(res05, res07); T3 = _mm_unpackhi_epi16(res05, res07); res01 = _mm_unpacklo_epi32(T0, T2); res03 = _mm_unpackhi_epi32(T0, T2); res05 = _mm_unpacklo_epi32(T1, T3); res07 = _mm_unpackhi_epi32(T1, T3); } // clip { const __m128i max_val = _mm_set1_epi16((1 << (clip_depth2 - 1)) - 1); const __m128i min_val = _mm_set1_epi16(-(1 << (clip_depth2 - 1))); res00 = _mm_max_epi16(_mm_min_epi16(res00, max_val), min_val); res02 = _mm_max_epi16(_mm_min_epi16(res02, max_val), min_val); res04 = _mm_max_epi16(_mm_min_epi16(res04, max_val), min_val); res06 = _mm_max_epi16(_mm_min_epi16(res06, max_val), min_val); res01 = _mm_max_epi16(_mm_min_epi16(res01, max_val), min_val); res03 = _mm_max_epi16(_mm_min_epi16(res03, max_val), min_val); res05 = _mm_max_epi16(_mm_min_epi16(res05, max_val), min_val); res07 = _mm_max_epi16(_mm_min_epi16(res07, max_val), min_val); } // store if (i_dst == 4) { _mm_store_si128((__m128i*)(dst + 0 * 4), res00); _mm_store_si128((__m128i*)(dst + 2 * 4), res02); _mm_store_si128((__m128i*)(dst + 4 * 4), res04); _mm_store_si128((__m128i*)(dst + 6 * 4), res06); _mm_store_si128((__m128i*)(dst + 8 * 4), res01); _mm_store_si128((__m128i*)(dst + 10 * 4), res03); 
_mm_store_si128((__m128i*)(dst + 12 * 4), res05);
        _mm_store_si128((__m128i*)(dst + 14 * 4), res07);
    } else {
        _mm_storel_epi64((__m128i*)(dst +  0 * i_dst), res00);
        _mm_storeh_pi   ((__m64  *)(dst +  1 * i_dst), _mm_castsi128_ps(res00));
        _mm_storel_epi64((__m128i*)(dst +  2 * i_dst), res02);
        _mm_storeh_pi   ((__m64  *)(dst +  3 * i_dst), _mm_castsi128_ps(res02));
        _mm_storel_epi64((__m128i*)(dst +  4 * i_dst), res04);
        _mm_storeh_pi   ((__m64  *)(dst +  5 * i_dst), _mm_castsi128_ps(res04));
        _mm_storel_epi64((__m128i*)(dst +  6 * i_dst), res06);
        _mm_storeh_pi   ((__m64  *)(dst +  7 * i_dst), _mm_castsi128_ps(res06));
        _mm_storel_epi64((__m128i*)(dst +  8 * i_dst), res01);
        _mm_storeh_pi   ((__m64  *)(dst +  9 * i_dst), _mm_castsi128_ps(res01));
        _mm_storel_epi64((__m128i*)(dst + 10 * i_dst), res03);
        _mm_storeh_pi   ((__m64  *)(dst + 11 * i_dst), _mm_castsi128_ps(res03));
        _mm_storel_epi64((__m128i*)(dst + 12 * i_dst), res05);
        _mm_storeh_pi   ((__m64  *)(dst + 13 * i_dst), _mm_castsi128_ps(res05));
        _mm_storel_epi64((__m128i*)(dst + 14 * i_dst), res07);
        _mm_storeh_pi   ((__m64  *)(dst + 15 * i_dst), _mm_castsi128_ps(res07));
    }
}

/* --------------------------------------------------------------------------- */
void idct_4x16_half_sse128(const coeff_t *src, coeff_t *dst, int i_dst)
{
    // TODO: implement this
    // only the top-left 4x8 corner (1/2 size) holds non-zero coefficients
    idct_4x16_sse128(src, dst, i_dst);
}

/* --------------------------------------------------------------------------- */
void idct_4x16_quad_sse128(const coeff_t *src, coeff_t *dst, int i_dst)
{
    // TODO: implement this
    // only the top-left 4x4 corner (1/4 size) holds non-zero coefficients
    idct_4x16_half_sse128(src, dst, i_dst);
}

/* --------------------------------------------------------------------------- */
void idct_16x4_sse128(const coeff_t *src, coeff_t *dst, int i_dst)
{
    const int shift1 = 5;
    const int shift2 = 20 - g_bit_depth;
    // const int clip_depth1 = LIMIT_BIT;
    const int clip_depth2 = g_bit_depth + 1;
    const __m128i c16_p43_p45 = _mm_set1_epi32(0x002B002D); //row0 87high - 90low address
    const __m128i c16_p35_p40 = _mm_set1_epi32(0x00230028);
    const __m128i c16_p21_p29 = _mm_set1_epi32(0x0015001D);
    const __m128i c16_p04_p13 = _mm_set1_epi32(0x0004000D);
    const __m128i c16_p29_p43 = _mm_set1_epi32(0x001D002B); //row1
    const __m128i c16_n21_p04 = _mm_set1_epi32(0xFFEB0004);
    const __m128i c16_n45_n40 = _mm_set1_epi32(0xFFD3FFD8);
    const __m128i c16_n13_n35 = _mm_set1_epi32(0xFFF3FFDD);
    const __m128i c16_p04_p40 = _mm_set1_epi32(0x00040028); //row2
    const __m128i c16_n43_n35 = _mm_set1_epi32(0xFFD5FFDD);
    const __m128i c16_p29_n13 = _mm_set1_epi32(0x001DFFF3);
    const __m128i c16_p21_p45 = _mm_set1_epi32(0x0015002D);
    const __m128i c16_n21_p35 = _mm_set1_epi32(0xFFEB0023); //row3
    const __m128i c16_p04_n43 = _mm_set1_epi32(0x0004FFD5);
    const __m128i c16_p13_p45 = _mm_set1_epi32(0x000D002D);
    const __m128i c16_n29_n40 = _mm_set1_epi32(0xFFE3FFD8);
    const __m128i c16_n40_p29 = _mm_set1_epi32(0xFFD8001D); //row4
    const __m128i c16_p45_n13 = _mm_set1_epi32(0x002DFFF3);
    const __m128i c16_n43_n04 = _mm_set1_epi32(0xFFD5FFFC);
    const __m128i c16_p35_p21 = _mm_set1_epi32(0x00230015);
    const __m128i c16_n45_p21 = _mm_set1_epi32(0xFFD30015); //row5
    const __m128i c16_p13_p29 = _mm_set1_epi32(0x000D001D);
    const __m128i c16_p35_n43 = _mm_set1_epi32(0x0023FFD5);
    const __m128i c16_n40_p04 = _mm_set1_epi32(0xFFD80004);
    const __m128i c16_n35_p13 = _mm_set1_epi32(0xFFDD000D); //row6
    const __m128i c16_n40_p45 = _mm_set1_epi32(0xFFD8002D);
    const __m128i c16_p04_p21 = _mm_set1_epi32(0x00040015);
    const __m128i c16_p43_n29 = _mm_set1_epi32(0x002BFFE3);
    const __m128i c16_n13_p04 =
_mm_set1_epi32(0xFFF30004); //row7 const __m128i c16_n29_p21 = _mm_set1_epi32(0xFFE30015); const __m128i c16_n40_p35 = _mm_set1_epi32(0xFFD80023); const __m128i c16_n45_p43 = _mm_set1_epi32(0xFFD3002B); const __m128i c16_p38_p44 = _mm_set1_epi32(0x0026002C); const __m128i c16_p09_p25 = _mm_set1_epi32(0x00090019); const __m128i c16_n09_p38 = _mm_set1_epi32(0xFFF70026); const __m128i c16_n25_n44 = _mm_set1_epi32(0xFFE7FFD4); const __m128i c16_n44_p25 = _mm_set1_epi32(0xFFD40019); const __m128i c16_p38_p09 = _mm_set1_epi32(0x00260009); const __m128i c16_n25_p09 = _mm_set1_epi32(0xFFE70009); const __m128i c16_n44_p38 = _mm_set1_epi32(0xFFD40026); const __m128i c16_p17_p42 = _mm_set1_epi32(0x0011002A); const __m128i c16_n42_p17 = _mm_set1_epi32(0xFFD60011); const __m128i c16_n32_p32 = _mm_set1_epi32(0xFFE00020); const __m128i c16_p32_p32 = _mm_set1_epi32(0x00200020); __m128i c32_rnd = _mm_set1_epi32(1 << (shift1 - 1)); // add1 // DCT1 __m128i in00[2], in01[2], in02[2], in03[2]; __m128i res00[2], res01[2], res02[2], res03[2]; int i, part; for (i = 0; i < 2; i++) { const int offset = (i << 3); in00[i] = _mm_loadu_si128((const __m128i*)&src[0 * 16 + offset]); // [07 06 05 04 03 02 01 00] in01[i] = _mm_loadu_si128((const __m128i*)&src[1 * 16 + offset]); // [17 16 15 14 13 12 11 10] in02[i] = _mm_loadu_si128((const __m128i*)&src[2 * 16 + offset]); // [27 26 25 24 23 22 21 20] in03[i] = _mm_loadu_si128((const __m128i*)&src[3 * 16 + offset]); // [37 36 35 34 33 32 31 30] } for (part = 0; part < 2; part++) { const __m128i T_00_00A = _mm_unpacklo_epi16(in01[part], in03[part]); const __m128i T_00_00B = _mm_unpackhi_epi16(in01[part], in03[part]); const __m128i T_00_01A = _mm_unpacklo_epi16(in00[part], in02[part]); const __m128i T_00_01B = _mm_unpackhi_epi16(in00[part], in02[part]); __m128i E0A, E0B, E1A, E1B, O0A, O0B, O1A, O1B; E0A = _mm_add_epi32(_mm_madd_epi16(T_00_01A, c16_p32_p32), c32_rnd); E1A = _mm_add_epi32(_mm_madd_epi16(T_00_01A, c16_n32_p32), c32_rnd); E0B = _mm_add_epi32(_mm_madd_epi16(T_00_01B, c16_p32_p32), c32_rnd); E1B = _mm_add_epi32(_mm_madd_epi16(T_00_01B, c16_n32_p32), c32_rnd); O0A = _mm_madd_epi16(T_00_00A, c16_p17_p42); O1A = _mm_madd_epi16(T_00_00A, c16_n42_p17); O0B = _mm_madd_epi16(T_00_00B, c16_p17_p42); O1B = _mm_madd_epi16(T_00_00B, c16_n42_p17); res00[part] = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E0A, O0A), 5), _mm_srai_epi32(_mm_add_epi32(E0B, O0B), 5)); res03[part] = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E0A, O0A), 5), _mm_srai_epi32(_mm_sub_epi32(E0B, O0B), 5)); res01[part] = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E1A, O1A), 5), _mm_srai_epi32(_mm_add_epi32(E1B, O1B), 5)); res02[part] = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E1A, O1A), 5), _mm_srai_epi32(_mm_sub_epi32(E1B, O1B), 5)); } // transpose matrix { __m128i tr0_0, tr0_1, tr0_2, tr0_3, tr0_4, tr0_5, tr0_6, tr0_7; __m128i tr1_0, tr1_1, tr1_2, tr1_3, tr1_4, tr1_5, tr1_6, tr1_7; tr0_0 = _mm_unpacklo_epi16(res00[0], res01[0]); tr0_1 = _mm_unpacklo_epi16(res02[0], res03[0]); tr0_2 = _mm_unpackhi_epi16(res00[0], res01[0]); tr0_3 = _mm_unpackhi_epi16(res02[0], res03[0]); tr0_4 = _mm_unpacklo_epi16(res00[1], res01[1]); tr0_5 = _mm_unpacklo_epi16(res02[1], res03[1]); tr0_6 = _mm_unpackhi_epi16(res00[1], res01[1]); tr0_7 = _mm_unpackhi_epi16(res02[1], res03[1]); tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); tr1_5 = 
_mm_unpacklo_epi32(tr0_6, tr0_7);
        tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
        tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
        // second pass
        c32_rnd = _mm_set1_epi32(1 << (shift2 - 1)); // add2
        {
            const __m128i T_00_00A = _mm_unpackhi_epi16(tr1_0, tr1_2); // [33 13 32 12 31 11 30 10]
            const __m128i T_00_01A = _mm_unpackhi_epi16(tr1_1, tr1_3); // [ ]
            const __m128i T_00_02A = _mm_unpackhi_epi16(tr1_4, tr1_6); // [ ]
            const __m128i T_00_03A = _mm_unpackhi_epi16(tr1_5, tr1_7); // [ ]
            const __m128i T_00_04A = _mm_unpacklo_epi16(tr1_2, tr1_3); // [ ]
            const __m128i T_00_05A = _mm_unpacklo_epi16(tr1_6, tr1_7); // [ ]
            const __m128i T_00_06A = _mm_unpacklo_epi16(tr1_1, tr1_5); // [ ]row
            const __m128i T_00_07A = _mm_unpacklo_epi16(tr1_0, tr1_4); // [83 03 82 02 81 01 80 00] row08 row00
            __m128i O0A, O1A, O2A, O3A, O4A, O5A, O6A, O7A;
            __m128i EO0A, EO1A, EO2A, EO3A;
            __m128i EEO0A, EEO1A;
            __m128i EEE0A, EEE1A;
#define COMPUTE_ROW(row0103, row0507, row0911, row1315, c0103, c0507, c0911, c1315, row) \
    row = _mm_add_epi32(_mm_add_epi32(_mm_madd_epi16(row0103, c0103), _mm_madd_epi16(row0507, c0507)), \
                        _mm_add_epi32(_mm_madd_epi16(row0911, c0911), _mm_madd_epi16(row1315, c1315)));
            COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_p43_p45, c16_p35_p40, c16_p21_p29, c16_p04_p13, O0A)
            COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_p29_p43, c16_n21_p04, c16_n45_n40, c16_n13_n35, O1A)
            COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_p04_p40, c16_n43_n35, c16_p29_n13, c16_p21_p45, O2A)
            COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n21_p35, c16_p04_n43, c16_p13_p45, c16_n29_n40, O3A)
            COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n40_p29, c16_p45_n13, c16_n43_n04, c16_p35_p21, O4A)
            COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n45_p21, c16_p13_p29, c16_p35_n43, c16_n40_p04, O5A)
            COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n35_p13, c16_n40_p45, c16_p04_p21, c16_p43_n29, O6A)
            COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n13_p04, c16_n29_p21, c16_n40_p35, c16_n45_p43, O7A)
#undef COMPUTE_ROW
            EO0A = _mm_add_epi32(_mm_madd_epi16(T_00_04A, c16_p38_p44), _mm_madd_epi16(T_00_05A, c16_p09_p25)); // EO0
            EO1A = _mm_add_epi32(_mm_madd_epi16(T_00_04A, c16_n09_p38), _mm_madd_epi16(T_00_05A, c16_n25_n44)); // EO1
            EO2A = _mm_add_epi32(_mm_madd_epi16(T_00_04A, c16_n44_p25), _mm_madd_epi16(T_00_05A, c16_p38_p09)); // EO2
            EO3A = _mm_add_epi32(_mm_madd_epi16(T_00_04A, c16_n25_p09), _mm_madd_epi16(T_00_05A, c16_n44_p38)); // EO3
            EEO0A = _mm_madd_epi16(T_00_06A, c16_p17_p42);
            EEO1A = _mm_madd_epi16(T_00_06A, c16_n42_p17);
            EEE0A = _mm_madd_epi16(T_00_07A, c16_p32_p32);
            EEE1A = _mm_madd_epi16(T_00_07A, c16_n32_p32);
            {
                const __m128i EE0A = _mm_add_epi32(EEE0A, EEO0A); // EE0 = EEE0 + EEO0
                const __m128i EE1A = _mm_add_epi32(EEE1A, EEO1A); // EE1 = EEE1 + EEO1
                const __m128i EE3A = _mm_sub_epi32(EEE0A, EEO0A); // EE2 = EEE0 - EEO0
                const __m128i EE2A = _mm_sub_epi32(EEE1A, EEO1A); // EE3 = EEE1 - EEO1
                const __m128i T10A = _mm_add_epi32(_mm_add_epi32(EE0A, EO0A), c32_rnd); // E0 (= EE0 + EO0) + rnd
                const __m128i T11A = _mm_add_epi32(_mm_add_epi32(EE1A, EO1A), c32_rnd); // E1 (= EE1 + EO1) + rnd
                const __m128i T12A = _mm_add_epi32(_mm_add_epi32(EE2A, EO2A), c32_rnd); // E2 (= EE2 + EO2) + rnd
                const __m128i T13A = _mm_add_epi32(_mm_add_epi32(EE3A, EO3A), c32_rnd); // E3 (= EE3 + EO3) + rnd
                const __m128i T14A = _mm_add_epi32(_mm_sub_epi32(EE3A, EO3A), c32_rnd); // E4 (= EE3 - EO3) + rnd
                const __m128i T15A = _mm_add_epi32(_mm_sub_epi32(EE2A, EO2A), c32_rnd); // E5 (= EE2 - EO2) + rnd
const __m128i T16A = _mm_add_epi32(_mm_sub_epi32(EE1A, EO1A), c32_rnd); // E6 (= EE1 - EO1) + rnd const __m128i T17A = _mm_add_epi32(_mm_sub_epi32(EE0A, EO0A), c32_rnd); // E7 (= EE0 - EO0) + rnd const __m128i T30A = _mm_srai_epi32(_mm_add_epi32(T10A, O0A), shift2); // E0 + O0 + rnd [30 20 10 00] const __m128i T31A = _mm_srai_epi32(_mm_add_epi32(T11A, O1A), shift2); // E1 + O1 + rnd [31 21 11 01] const __m128i T32A = _mm_srai_epi32(_mm_add_epi32(T12A, O2A), shift2); // E2 + O2 + rnd [32 22 12 02] const __m128i T33A = _mm_srai_epi32(_mm_add_epi32(T13A, O3A), shift2); // E3 + O3 + rnd [33 23 13 03] const __m128i T34A = _mm_srai_epi32(_mm_add_epi32(T14A, O4A), shift2); // E4 [33 24 14 04] const __m128i T35A = _mm_srai_epi32(_mm_add_epi32(T15A, O5A), shift2); // E5 [35 25 15 05] const __m128i T36A = _mm_srai_epi32(_mm_add_epi32(T16A, O6A), shift2); // E6 [36 26 16 06] const __m128i T37A = _mm_srai_epi32(_mm_add_epi32(T17A, O7A), shift2); // E7 [37 27 17 07] const __m128i T38A = _mm_srai_epi32(_mm_sub_epi32(T17A, O7A), shift2); // E7 [30 20 10 00] x8 const __m128i T39A = _mm_srai_epi32(_mm_sub_epi32(T16A, O6A), shift2); // E6 [31 21 11 01] x9 const __m128i T3AA = _mm_srai_epi32(_mm_sub_epi32(T15A, O5A), shift2); // E5 [32 22 12 02] xA const __m128i T3BA = _mm_srai_epi32(_mm_sub_epi32(T14A, O4A), shift2); // E4 [33 23 13 03] xB const __m128i T3CA = _mm_srai_epi32(_mm_sub_epi32(T13A, O3A), shift2); // E3 - O3 + rnd [33 24 14 04] xC const __m128i T3DA = _mm_srai_epi32(_mm_sub_epi32(T12A, O2A), shift2); // E2 - O2 + rnd [35 25 15 05] xD const __m128i T3EA = _mm_srai_epi32(_mm_sub_epi32(T11A, O1A), shift2); // E1 - O1 + rnd [36 26 16 06] xE const __m128i T3FA = _mm_srai_epi32(_mm_sub_epi32(T10A, O0A), shift2); // E0 - O0 + rnd [37 27 17 07] xF res00[0] = _mm_packs_epi32(T30A, T38A); res01[0] = _mm_packs_epi32(T31A, T39A); res02[0] = _mm_packs_epi32(T32A, T3AA); res03[0] = _mm_packs_epi32(T33A, T3BA); res00[1] = _mm_packs_epi32(T34A, T3CA); res01[1] = _mm_packs_epi32(T35A, T3DA); res02[1] = _mm_packs_epi32(T36A, T3EA); res03[1] = _mm_packs_epi32(T37A, T3FA); } } // transpose matrix tr0_0 = _mm_unpacklo_epi16(res00[0], res01[0]); tr0_1 = _mm_unpacklo_epi16(res02[0], res03[0]); tr0_2 = _mm_unpackhi_epi16(res00[0], res01[0]); tr0_3 = _mm_unpackhi_epi16(res02[0], res03[0]); tr0_4 = _mm_unpacklo_epi16(res00[1], res01[1]); tr0_5 = _mm_unpacklo_epi16(res02[1], res03[1]); tr0_6 = _mm_unpackhi_epi16(res00[1], res01[1]); tr0_7 = _mm_unpackhi_epi16(res02[1], res03[1]); tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); res00[0] = _mm_unpacklo_epi64(tr1_0, tr1_4); res01[0] = _mm_unpackhi_epi64(tr1_0, tr1_4); res02[0] = _mm_unpacklo_epi64(tr1_2, tr1_6); res03[0] = _mm_unpackhi_epi64(tr1_2, tr1_6); res00[1] = _mm_unpacklo_epi64(tr1_1, tr1_5); res01[1] = _mm_unpackhi_epi64(tr1_1, tr1_5); res02[1] = _mm_unpacklo_epi64(tr1_3, tr1_7); res03[1] = _mm_unpackhi_epi64(tr1_3, tr1_7); // clip { const __m128i max_val = _mm_set1_epi16((1 << (clip_depth2 - 1)) - 1); const __m128i min_val = _mm_set1_epi16(-(1 << (clip_depth2 - 1))); res00[0] = _mm_max_epi16(_mm_min_epi16(res00[0], max_val), min_val); res01[0] = _mm_max_epi16(_mm_min_epi16(res01[0], max_val), min_val); res02[0] = _mm_max_epi16(_mm_min_epi16(res02[0], max_val), min_val); res03[0] = 
_mm_max_epi16(_mm_min_epi16(res03[0], max_val), min_val);
            res00[1] = _mm_max_epi16(_mm_min_epi16(res00[1], max_val), min_val);
            res01[1] = _mm_max_epi16(_mm_min_epi16(res01[1], max_val), min_val);
            res02[1] = _mm_max_epi16(_mm_min_epi16(res02[1], max_val), min_val);
            res03[1] = _mm_max_epi16(_mm_min_epi16(res03[1], max_val), min_val);
        }
    }
    _mm_storeu_si128((__m128i*)(dst + 0 * i_dst    ), res00[0]);
    _mm_storeu_si128((__m128i*)(dst + 0 * i_dst + 8), res00[1]);
    _mm_storeu_si128((__m128i*)(dst + 1 * i_dst    ), res01[0]);
    _mm_storeu_si128((__m128i*)(dst + 1 * i_dst + 8), res01[1]);
    _mm_storeu_si128((__m128i*)(dst + 2 * i_dst    ), res02[0]);
    _mm_storeu_si128((__m128i*)(dst + 2 * i_dst + 8), res02[1]);
    _mm_storeu_si128((__m128i*)(dst + 3 * i_dst    ), res03[0]);
    _mm_storeu_si128((__m128i*)(dst + 3 * i_dst + 8), res03[1]);
}

/* --------------------------------------------------------------------------- */
void idct_16x4_half_sse128(const coeff_t *src, coeff_t *dst, int i_dst)
{
    // TODO: implement this
    // only the top-left 8x4 corner (1/2 size) holds non-zero coefficients
    idct_16x4_sse128(src, dst, i_dst);
}

/* --------------------------------------------------------------------------- */
void idct_16x4_quad_sse128(const coeff_t *src, coeff_t *dst, int i_dst)
{
    // TODO: implement this
    // only the top-left 4x4 corner (1/4 size) holds non-zero coefficients
    idct_16x4_half_sse128(src, dst, i_dst);
}

/* --------------------------------------------------------------------------- */
void idct_8x8_sse128(const coeff_t *src, coeff_t *dst, int i_dst)
{
    // const int shift1 = 5;
    const int shift2 = 20 - g_bit_depth;
    // const int clip_depth1 = LIMIT_BIT;
    const int clip_depth2 = g_bit_depth + 1;
    __m128i S0, S1, S2, S3, S4, S5, S6, S7;
    __m128i mAdd, T0, T1, T2, T3;
    __m128i E0h, E1h, E2h, E3h, E0l, E1l, E2l, E3l;
    __m128i O0h, O1h, O2h, O3h, O0l, O1l, O2l, O3l;
    __m128i EE0l, EE1l, E00l, E01l, EE0h, EE1h, E00h, E01h;
    __m128i T00, T01, T02, T03, T04, T05, T06, T07;
    mAdd = _mm_set1_epi32(16); // add1
    S1 = _mm_load_si128((__m128i*)&src[8]);
    S3 = _mm_load_si128((__m128i*)&src[24]);
    T0 = _mm_unpacklo_epi16(S1, S3);
    E1l = _mm_madd_epi16(T0, _mm_load_si128((__m128i*)(tab_idct_8x8[0])));
    T1 = _mm_unpackhi_epi16(S1, S3);
    E1h = _mm_madd_epi16(T1, _mm_load_si128((__m128i*)(tab_idct_8x8[0])));
    S5 = _mm_load_si128((__m128i*)&src[40]);
    S7 = _mm_load_si128((__m128i*)&src[56]);
    T2 = _mm_unpacklo_epi16(S5, S7);
    E2l = _mm_madd_epi16(T2, _mm_load_si128((__m128i*)(tab_idct_8x8[1])));
    T3 = _mm_unpackhi_epi16(S5, S7);
    E2h = _mm_madd_epi16(T3, _mm_load_si128((__m128i*)(tab_idct_8x8[1])));
    O0l = _mm_add_epi32(E1l, E2l);
    O0h = _mm_add_epi32(E1h, E2h);
    E1l = _mm_madd_epi16(T0, _mm_load_si128((__m128i*)(tab_idct_8x8[2])));
    E1h = _mm_madd_epi16(T1, _mm_load_si128((__m128i*)(tab_idct_8x8[2])));
    E2l = _mm_madd_epi16(T2, _mm_load_si128((__m128i*)(tab_idct_8x8[3])));
    E2h = _mm_madd_epi16(T3, _mm_load_si128((__m128i*)(tab_idct_8x8[3])));
    O1l = _mm_add_epi32(E1l, E2l);
    O1h = _mm_add_epi32(E1h, E2h);
    E1l = _mm_madd_epi16(T0, _mm_load_si128((__m128i*)(tab_idct_8x8[4])));
    E1h = _mm_madd_epi16(T1, _mm_load_si128((__m128i*)(tab_idct_8x8[4])));
    E2l = _mm_madd_epi16(T2, _mm_load_si128((__m128i*)(tab_idct_8x8[5])));
    E2h = _mm_madd_epi16(T3, _mm_load_si128((__m128i*)(tab_idct_8x8[5])));
    O2l = _mm_add_epi32(E1l, E2l);
    O2h = _mm_add_epi32(E1h, E2h);
    E1l = _mm_madd_epi16(T0, _mm_load_si128((__m128i*)(tab_idct_8x8[6])));
    E1h = _mm_madd_epi16(T1, _mm_load_si128((__m128i*)(tab_idct_8x8[6])));
    E2l = _mm_madd_epi16(T2, _mm_load_si128((__m128i*)(tab_idct_8x8[7])));
    E2h = _mm_madd_epi16(T3, _mm_load_si128((__m128i*)(tab_idct_8x8[7])));
    O3h = _mm_add_epi32(E1h, E2h);
    O3l = _mm_add_epi32(E1l,
E2l);
    /* ------- */
    S0 = _mm_load_si128((__m128i*)&src[0]);
    S4 = _mm_load_si128((__m128i*)&src[32]);
    T0 = _mm_unpacklo_epi16(S0, S4);
    EE0l = _mm_madd_epi16(T0, _mm_load_si128((__m128i*)(tab_idct_8x8[8])));
    T1 = _mm_unpackhi_epi16(S0, S4);
    EE0h = _mm_madd_epi16(T1, _mm_load_si128((__m128i*)(tab_idct_8x8[8])));
    EE1l = _mm_madd_epi16(T0, _mm_load_si128((__m128i*)(tab_idct_8x8[9])));
    EE1h = _mm_madd_epi16(T1, _mm_load_si128((__m128i*)(tab_idct_8x8[9])));
    /* ------- */
    S2 = _mm_load_si128((__m128i*)&src[16]);
    S6 = _mm_load_si128((__m128i*)&src[48]);
    T0 = _mm_unpacklo_epi16(S2, S6);
    E00l = _mm_madd_epi16(T0, _mm_load_si128((__m128i*)(tab_idct_8x8[10])));
    T1 = _mm_unpackhi_epi16(S2, S6);
    E00h = _mm_madd_epi16(T1, _mm_load_si128((__m128i*)(tab_idct_8x8[10])));
    E01l = _mm_madd_epi16(T0, _mm_load_si128((__m128i*)(tab_idct_8x8[11])));
    E01h = _mm_madd_epi16(T1, _mm_load_si128((__m128i*)(tab_idct_8x8[11])));
    E0l = _mm_add_epi32(EE0l, E00l);
    E0l = _mm_add_epi32(E0l, mAdd);
    E0h = _mm_add_epi32(EE0h, E00h);
    E0h = _mm_add_epi32(E0h, mAdd);
    E3l = _mm_sub_epi32(EE0l, E00l);
    E3l = _mm_add_epi32(E3l, mAdd);
    E3h = _mm_sub_epi32(EE0h, E00h);
    E3h = _mm_add_epi32(E3h, mAdd);
    E1l = _mm_add_epi32(EE1l, E01l);
    E1l = _mm_add_epi32(E1l, mAdd);
    E1h = _mm_add_epi32(EE1h, E01h);
    E1h = _mm_add_epi32(E1h, mAdd);
    E2l = _mm_sub_epi32(EE1l, E01l);
    E2l = _mm_add_epi32(E2l, mAdd);
    E2h = _mm_sub_epi32(EE1h, E01h);
    E2h = _mm_add_epi32(E2h, mAdd);
    S0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E0l, O0l), 5), _mm_srai_epi32(_mm_add_epi32(E0h, O0h), 5)); // shift of the first transform pass
    S7 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E0l, O0l), 5), _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), 5));
    S1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E1l, O1l), 5), _mm_srai_epi32(_mm_add_epi32(E1h, O1h), 5));
    S6 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E1l, O1l), 5), _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), 5));
    S2 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E2l, O2l), 5), _mm_srai_epi32(_mm_add_epi32(E2h, O2h), 5));
    S5 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E2l, O2l), 5), _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), 5));
    S3 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E3l, O3l), 5), _mm_srai_epi32(_mm_add_epi32(E3h, O3h), 5));
    S4 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E3l, O3l), 5), _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), 5));
    /* Inverse matrix */
    E0l = _mm_unpacklo_epi16(S0, S4);
    E1l = _mm_unpacklo_epi16(S1, S5);
    E2l = _mm_unpacklo_epi16(S2, S6);
    E3l = _mm_unpacklo_epi16(S3, S7);
    O0l = _mm_unpackhi_epi16(S0, S4);
    O1l = _mm_unpackhi_epi16(S1, S5);
    O2l = _mm_unpackhi_epi16(S2, S6);
    O3l = _mm_unpackhi_epi16(S3, S7);
    T0 = _mm_unpacklo_epi16(E0l, E2l);
    T1 = _mm_unpacklo_epi16(E1l, E3l);
    S0 = _mm_unpacklo_epi16(T0, T1);
    S1 = _mm_unpackhi_epi16(T0, T1);
    T2 = _mm_unpackhi_epi16(E0l, E2l);
    T3 = _mm_unpackhi_epi16(E1l, E3l);
    S2 = _mm_unpacklo_epi16(T2, T3);
    S3 = _mm_unpackhi_epi16(T2, T3);
    T0 = _mm_unpacklo_epi16(O0l, O2l);
    T1 = _mm_unpacklo_epi16(O1l, O3l);
    S4 = _mm_unpacklo_epi16(T0, T1);
    S5 = _mm_unpackhi_epi16(T0, T1);
    T2 = _mm_unpackhi_epi16(O0l, O2l);
    T3 = _mm_unpackhi_epi16(O1l, O3l);
    S6 = _mm_unpacklo_epi16(T2, T3);
    S7 = _mm_unpackhi_epi16(T2, T3);
    mAdd = _mm_set1_epi32(1 << (shift2 - 1)); // add2
    T0 = _mm_unpacklo_epi16(S1, S3);
    E1l = _mm_madd_epi16(T0, _mm_load_si128((__m128i*)(tab_idct_8x8[0])));
    T1 = _mm_unpackhi_epi16(S1, S3);
    E1h = _mm_madd_epi16(T1, _mm_load_si128((__m128i*)(tab_idct_8x8[0])));
    T2 = _mm_unpacklo_epi16(S5, S7);
    E2l = _mm_madd_epi16(T2, _mm_load_si128((__m128i*)(tab_idct_8x8[1])));
    T3 = _mm_unpackhi_epi16(S5, S7);
    E2h =
_mm_madd_epi16(T3, _mm_load_si128((__m128i*)(tab_idct_8x8[1]))); O0l = _mm_add_epi32(E1l, E2l); O0h = _mm_add_epi32(E1h, E2h); E1l = _mm_madd_epi16(T0, _mm_load_si128((__m128i*)(tab_idct_8x8[2]))); E1h = _mm_madd_epi16(T1, _mm_load_si128((__m128i*)(tab_idct_8x8[2]))); E2l = _mm_madd_epi16(T2, _mm_load_si128((__m128i*)(tab_idct_8x8[3]))); E2h = _mm_madd_epi16(T3, _mm_load_si128((__m128i*)(tab_idct_8x8[3]))); O1l = _mm_add_epi32(E1l, E2l); O1h = _mm_add_epi32(E1h, E2h); E1l = _mm_madd_epi16(T0, _mm_load_si128((__m128i*)(tab_idct_8x8[4]))); E1h = _mm_madd_epi16(T1, _mm_load_si128((__m128i*)(tab_idct_8x8[4]))); E2l = _mm_madd_epi16(T2, _mm_load_si128((__m128i*)(tab_idct_8x8[5]))); E2h = _mm_madd_epi16(T3, _mm_load_si128((__m128i*)(tab_idct_8x8[5]))); O2l = _mm_add_epi32(E1l, E2l); O2h = _mm_add_epi32(E1h, E2h); E1l = _mm_madd_epi16(T0, _mm_load_si128((__m128i*)(tab_idct_8x8[6]))); E1h = _mm_madd_epi16(T1, _mm_load_si128((__m128i*)(tab_idct_8x8[6]))); E2l = _mm_madd_epi16(T2, _mm_load_si128((__m128i*)(tab_idct_8x8[7]))); E2h = _mm_madd_epi16(T3, _mm_load_si128((__m128i*)(tab_idct_8x8[7]))); O3h = _mm_add_epi32(E1h, E2h); O3l = _mm_add_epi32(E1l, E2l); T0 = _mm_unpacklo_epi16(S0, S4); T1 = _mm_unpackhi_epi16(S0, S4); EE0l = _mm_madd_epi16(T0, _mm_load_si128((__m128i*)(tab_idct_8x8[8]))); EE0h = _mm_madd_epi16(T1, _mm_load_si128((__m128i*)(tab_idct_8x8[8]))); EE1l = _mm_madd_epi16(T0, _mm_load_si128((__m128i*)(tab_idct_8x8[9]))); EE1h = _mm_madd_epi16(T1, _mm_load_si128((__m128i*)(tab_idct_8x8[9]))); T0 = _mm_unpacklo_epi16(S2, S6); T1 = _mm_unpackhi_epi16(S2, S6); E00l = _mm_madd_epi16(T0, _mm_load_si128((__m128i*)(tab_idct_8x8[10]))); E00h = _mm_madd_epi16(T1, _mm_load_si128((__m128i*)(tab_idct_8x8[10]))); E01l = _mm_madd_epi16(T0, _mm_load_si128((__m128i*)(tab_idct_8x8[11]))); E01h = _mm_madd_epi16(T1, _mm_load_si128((__m128i*)(tab_idct_8x8[11]))); E0l = _mm_add_epi32(EE0l, E00l); E0l = _mm_add_epi32(E0l, mAdd); E0h = _mm_add_epi32(EE0h, E00h); E0h = _mm_add_epi32(E0h, mAdd); E3l = _mm_sub_epi32(EE0l, E00l); E3l = _mm_add_epi32(E3l, mAdd); E3h = _mm_sub_epi32(EE0h, E00h); E3h = _mm_add_epi32(E3h, mAdd); E1l = _mm_add_epi32(EE1l, E01l); E1l = _mm_add_epi32(E1l, mAdd); E1h = _mm_add_epi32(EE1h, E01h); E1h = _mm_add_epi32(E1h, mAdd); E2l = _mm_sub_epi32(EE1l, E01l); E2l = _mm_add_epi32(E2l, mAdd); E2h = _mm_sub_epi32(EE1h, E01h); E2h = _mm_add_epi32(E2h, mAdd); S0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift2), _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift2)); S7 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift2), _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift2)); S1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift2), _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift2)); S6 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift2), _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift2)); S2 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift2), _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift2)); S5 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift2), _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift2)); S3 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift2), _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift2)); S4 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift2), _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift2)); // [07 06 05 04 03 02 01 00] // [17 16 15 14 13 12 11 10] // [27 26 25 24 23 22 21 20] // [37 36 35 34 33 32 31 30] // [47 46 45 44 43 42 41 40] // [57 56 55 54 53 52 51 50] // [67 66 65 64 63 62 
61 60] // [77 76 75 74 73 72 71 70] T00 = _mm_unpacklo_epi16(S0, S1); // [13 03 12 02 11 01 10 00] T01 = _mm_unpackhi_epi16(S0, S1); // [17 07 16 06 15 05 14 04] T02 = _mm_unpacklo_epi16(S2, S3); // [33 23 32 22 31 21 30 20] T03 = _mm_unpackhi_epi16(S2, S3); // [37 27 36 26 35 25 34 24] T04 = _mm_unpacklo_epi16(S4, S5); // [53 43 52 42 51 41 50 40] T05 = _mm_unpackhi_epi16(S4, S5); // [57 47 56 46 55 45 54 44] T06 = _mm_unpacklo_epi16(S6, S7); // [73 63 72 62 71 61 70 60] T07 = _mm_unpackhi_epi16(S6, S7); // [77 67 76 66 75 65 74 64] // clip { const __m128i max_val = _mm_set1_epi16((1 << (clip_depth2 - 1)) - 1); const __m128i min_val = _mm_set1_epi16(-(1 << (clip_depth2 - 1))); T00 = _mm_max_epi16(_mm_min_epi16(T00, max_val), min_val); T01 = _mm_max_epi16(_mm_min_epi16(T01, max_val), min_val); T02 = _mm_max_epi16(_mm_min_epi16(T02, max_val), min_val); T03 = _mm_max_epi16(_mm_min_epi16(T03, max_val), min_val); T04 = _mm_max_epi16(_mm_min_epi16(T04, max_val), min_val); T05 = _mm_max_epi16(_mm_min_epi16(T05, max_val), min_val); T06 = _mm_max_epi16(_mm_min_epi16(T06, max_val), min_val); T07 = _mm_max_epi16(_mm_min_epi16(T07, max_val), min_val); } { __m128i T10, T11, T12, T13; T10 = _mm_unpacklo_epi32(T00, T02); // [31 21 11 01 30 20 10 00] T11 = _mm_unpackhi_epi32(T00, T02); // [33 23 13 03 32 22 12 02] T12 = _mm_unpacklo_epi32(T04, T06); // [71 61 51 41 70 60 50 40] T13 = _mm_unpackhi_epi32(T04, T06); // [73 63 53 43 72 62 52 42] _mm_store_si128((__m128i*)(dst + 0 * i_dst), _mm_unpacklo_epi64(T10, T12)); // [70 60 50 40 30 20 10 00] _mm_store_si128((__m128i*)(dst + 1 * i_dst), _mm_unpackhi_epi64(T10, T12)); // [71 61 51 41 31 21 11 01] _mm_store_si128((__m128i*)(dst + 2 * i_dst), _mm_unpacklo_epi64(T11, T13)); // [72 62 52 42 32 22 12 02] _mm_store_si128((__m128i*)(dst + 3 * i_dst), _mm_unpackhi_epi64(T11, T13)); // [73 63 53 43 33 23 13 03] T10 = _mm_unpacklo_epi32(T01, T03); // [35 25 15 05 34 24 14 04] T12 = _mm_unpacklo_epi32(T05, T07); // [75 65 55 45 74 64 54 44] T11 = _mm_unpackhi_epi32(T01, T03); // [37 27 17 07 36 26 16 06] T13 = _mm_unpackhi_epi32(T05, T07); // [77 67 57 47 76 66 56 46] _mm_store_si128((__m128i*)(dst + 4 * i_dst), _mm_unpacklo_epi64(T10, T12)); // [74 64 54 44 34 24 14 04] _mm_store_si128((__m128i*)(dst + 5 * i_dst), _mm_unpackhi_epi64(T10, T12)); // [75 65 55 45 35 25 15 05] _mm_store_si128((__m128i*)(dst + 6 * i_dst), _mm_unpacklo_epi64(T11, T13)); // [76 66 56 46 36 26 16 06] _mm_store_si128((__m128i*)(dst + 7 * i_dst), _mm_unpackhi_epi64(T11, T13)); // [77 67 57 47 37 27 17 07] } } /* --------------------------------------------------------------------------- */ void idct_8x8_half_sse128(const coeff_t *src, coeff_t *dst, int i_dst) { // TODO: implement this // only the top-left 4x4 sub-block holds non-zero coefficients; falling back to the full 8x8 IDCT is still correct idct_8x8_sse128(src, dst, i_dst); } /* --------------------------------------------------------------------------- */ void idct_8x8_quad_sse128(const coeff_t *src, coeff_t *dst, int i_dst) { // TODO: implement this // only the top-left 2x2 sub-block holds non-zero coefficients; the half version covers it idct_8x8_half_sse128(src, dst, i_dst); } /* --------------------------------------------------------------------------- */ void idct_16x16_sse128(const coeff_t *src, coeff_t *dst, int i_dst) { const int shift1 = 5; const int shift2 = 20 - g_bit_depth; //const int clip_depth1 = LIMIT_BIT; const int clip_depth2 = g_bit_depth + 1; const __m128i c16_p43_p45 = _mm_set1_epi32(0x002B002D); //row0 87high - 90low address const __m128i c16_p35_p40 = _mm_set1_epi32(0x00230028); const __m128i c16_p21_p29 = _mm_set1_epi32(0x0015001D); const __m128i c16_p04_p13 = 
_mm_set1_epi32(0x0004000D); const __m128i c16_p29_p43 = _mm_set1_epi32(0x001D002B); //row1 const __m128i c16_n21_p04 = _mm_set1_epi32(0xFFEB0004); const __m128i c16_n45_n40 = _mm_set1_epi32(0xFFD3FFD8); const __m128i c16_n13_n35 = _mm_set1_epi32(0xFFF3FFDD); const __m128i c16_p04_p40 = _mm_set1_epi32(0x00040028); //row2 const __m128i c16_n43_n35 = _mm_set1_epi32(0xFFD5FFDD); const __m128i c16_p29_n13 = _mm_set1_epi32(0x001DFFF3); const __m128i c16_p21_p45 = _mm_set1_epi32(0x0015002D); const __m128i c16_n21_p35 = _mm_set1_epi32(0xFFEB0023); //row3 const __m128i c16_p04_n43 = _mm_set1_epi32(0x0004FFD5); const __m128i c16_p13_p45 = _mm_set1_epi32(0x000D002D); const __m128i c16_n29_n40 = _mm_set1_epi32(0xFFE3FFD8); const __m128i c16_n40_p29 = _mm_set1_epi32(0xFFD8001D); //row4 const __m128i c16_p45_n13 = _mm_set1_epi32(0x002DFFF3); const __m128i c16_n43_n04 = _mm_set1_epi32(0xFFD5FFFC); const __m128i c16_p35_p21 = _mm_set1_epi32(0x00230015); const __m128i c16_n45_p21 = _mm_set1_epi32(0xFFD30015); //row5 const __m128i c16_p13_p29 = _mm_set1_epi32(0x000D001D); const __m128i c16_p35_n43 = _mm_set1_epi32(0x0023FFD5); const __m128i c16_n40_p04 = _mm_set1_epi32(0xFFD80004); const __m128i c16_n35_p13 = _mm_set1_epi32(0xFFDD000D); //row6 const __m128i c16_n40_p45 = _mm_set1_epi32(0xFFD8002D); const __m128i c16_p04_p21 = _mm_set1_epi32(0x00040015); const __m128i c16_p43_n29 = _mm_set1_epi32(0x002BFFE3); const __m128i c16_n13_p04 = _mm_set1_epi32(0xFFF30004); //row7 const __m128i c16_n29_p21 = _mm_set1_epi32(0xFFE30015); const __m128i c16_n40_p35 = _mm_set1_epi32(0xFFD80023); const __m128i c16_n45_p43 = _mm_set1_epi32(0xFFD3002B); const __m128i c16_p38_p44 = _mm_set1_epi32(0x0026002C); const __m128i c16_p09_p25 = _mm_set1_epi32(0x00090019); const __m128i c16_n09_p38 = _mm_set1_epi32(0xFFF70026); const __m128i c16_n25_n44 = _mm_set1_epi32(0xFFE7FFD4); const __m128i c16_n44_p25 = _mm_set1_epi32(0xFFD40019); const __m128i c16_p38_p09 = _mm_set1_epi32(0x00260009); const __m128i c16_n25_p09 = _mm_set1_epi32(0xFFE70009); const __m128i c16_n44_p38 = _mm_set1_epi32(0xFFD40026); const __m128i c16_p17_p42 = _mm_set1_epi32(0x0011002A); const __m128i c16_n42_p17 = _mm_set1_epi32(0xFFD60011); const __m128i c16_n32_p32 = _mm_set1_epi32(0xFFE00020); const __m128i c16_p32_p32 = _mm_set1_epi32(0x00200020); int i, pass, part; int nShift = shift1; __m128i c32_rnd = _mm_set1_epi32((1 << shift1) >> 1); // add1 // DCT1 __m128i in00[2], in01[2], in02[2], in03[2], in04[2], in05[2], in06[2], in07[2]; __m128i in08[2], in09[2], in10[2], in11[2], in12[2], in13[2], in14[2], in15[2]; __m128i res00[2], res01[2], res02[2], res03[2], res04[2], res05[2], res06[2], res07[2]; __m128i res08[2], res09[2], res10[2], res11[2], res12[2], res13[2], res14[2], res15[2]; for (i = 0; i < 2; i++) { const int offset = (i << 3); in00[i] = _mm_load_si128((const __m128i*)&src[ 0 * 16 + offset]); // [07 06 05 04 03 02 01 00] in01[i] = _mm_load_si128((const __m128i*)&src[ 1 * 16 + offset]); // [17 16 15 14 13 12 11 10] in02[i] = _mm_load_si128((const __m128i*)&src[ 2 * 16 + offset]); // [27 26 25 24 23 22 21 20] in03[i] = _mm_load_si128((const __m128i*)&src[ 3 * 16 + offset]); // [37 36 35 34 33 32 31 30] in04[i] = _mm_load_si128((const __m128i*)&src[ 4 * 16 + offset]); // [47 46 45 44 43 42 41 40] in05[i] = _mm_load_si128((const __m128i*)&src[ 5 * 16 + offset]); // [57 56 55 54 53 52 51 50] in06[i] = _mm_load_si128((const __m128i*)&src[ 6 * 16 + offset]); // [67 66 65 64 63 62 61 60] in07[i] = _mm_load_si128((const __m128i*)&src[ 7 * 16 + offset]); // 
[77 76 75 74 73 72 71 70] in08[i] = _mm_load_si128((const __m128i*)&src[ 8 * 16 + offset]); in09[i] = _mm_load_si128((const __m128i*)&src[ 9 * 16 + offset]); in10[i] = _mm_load_si128((const __m128i*)&src[10 * 16 + offset]); in11[i] = _mm_load_si128((const __m128i*)&src[11 * 16 + offset]); in12[i] = _mm_load_si128((const __m128i*)&src[12 * 16 + offset]); in13[i] = _mm_load_si128((const __m128i*)&src[13 * 16 + offset]); in14[i] = _mm_load_si128((const __m128i*)&src[14 * 16 + offset]); in15[i] = _mm_load_si128((const __m128i*)&src[15 * 16 + offset]); } for (pass = 0; pass < 2; pass++) { for (part = 0; part < 2; part++) { const __m128i T_00_00A = _mm_unpacklo_epi16(in01[part], in03[part]); // [33 13 32 12 31 11 30 10] const __m128i T_00_00B = _mm_unpackhi_epi16(in01[part], in03[part]); // [37 17 36 16 35 15 34 14] const __m128i T_00_01A = _mm_unpacklo_epi16(in05[part], in07[part]); // [ ] const __m128i T_00_01B = _mm_unpackhi_epi16(in05[part], in07[part]); // [ ] const __m128i T_00_02A = _mm_unpacklo_epi16(in09[part], in11[part]); // [ ] const __m128i T_00_02B = _mm_unpackhi_epi16(in09[part], in11[part]); // [ ] const __m128i T_00_03A = _mm_unpacklo_epi16(in13[part], in15[part]); // [ ] const __m128i T_00_03B = _mm_unpackhi_epi16(in13[part], in15[part]); // [ ] const __m128i T_00_04A = _mm_unpacklo_epi16(in02[part], in06[part]); // [ ] const __m128i T_00_04B = _mm_unpackhi_epi16(in02[part], in06[part]); // [ ] const __m128i T_00_05A = _mm_unpacklo_epi16(in10[part], in14[part]); // [ ] const __m128i T_00_05B = _mm_unpackhi_epi16(in10[part], in14[part]); // [ ] const __m128i T_00_06A = _mm_unpacklo_epi16(in04[part], in12[part]); // [ ]row const __m128i T_00_06B = _mm_unpackhi_epi16(in04[part], in12[part]); // [ ] const __m128i T_00_07A = _mm_unpacklo_epi16(in00[part], in08[part]); // [83 03 82 02 81 01 81 00] row08 row00 const __m128i T_00_07B = _mm_unpackhi_epi16(in00[part], in08[part]); // [87 07 86 06 85 05 84 04] __m128i O0A, O1A, O2A, O3A, O4A, O5A, O6A, O7A; __m128i O0B, O1B, O2B, O3B, O4B, O5B, O6B, O7B; __m128i EO0A, EO1A, EO2A, EO3A; __m128i EO0B, EO1B, EO2B, EO3B; __m128i EEO0A, EEO1A; __m128i EEO0B, EEO1B; __m128i EEE0A, EEE1A; __m128i EEE0B, EEE1B; __m128i T00, T01; #define COMPUTE_ROW(row0103, row0507, row0911, row1315, c0103, c0507, c0911, c1315, row) \ T00 = _mm_add_epi32(_mm_madd_epi16(row0103, c0103), _mm_madd_epi16(row0507, c0507)); \ T01 = _mm_add_epi32(_mm_madd_epi16(row0911, c0911), _mm_madd_epi16(row1315, c1315)); \ row = _mm_add_epi32(T00, T01); COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_p43_p45, c16_p35_p40, c16_p21_p29, c16_p04_p13, O0A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_p29_p43, c16_n21_p04, c16_n45_n40, c16_n13_n35, O1A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_p04_p40, c16_n43_n35, c16_p29_n13, c16_p21_p45, O2A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n21_p35, c16_p04_n43, c16_p13_p45, c16_n29_n40, O3A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n40_p29, c16_p45_n13, c16_n43_n04, c16_p35_p21, O4A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n45_p21, c16_p13_p29, c16_p35_n43, c16_n40_p04, O5A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n35_p13, c16_n40_p45, c16_p04_p21, c16_p43_n29, O6A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n13_p04, c16_n29_p21, c16_n40_p35, c16_n45_p43, O7A) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_p43_p45, c16_p35_p40, c16_p21_p29, c16_p04_p13, O0B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_p29_p43, 
c16_n21_p04, c16_n45_n40, c16_n13_n35, O1B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_p04_p40, c16_n43_n35, c16_p29_n13, c16_p21_p45, O2B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n21_p35, c16_p04_n43, c16_p13_p45, c16_n29_n40, O3B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n40_p29, c16_p45_n13, c16_n43_n04, c16_p35_p21, O4B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n45_p21, c16_p13_p29, c16_p35_n43, c16_n40_p04, O5B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n35_p13, c16_n40_p45, c16_p04_p21, c16_p43_n29, O6B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n13_p04, c16_n29_p21, c16_n40_p35, c16_n45_p43, O7B) #undef COMPUTE_ROW EO0A = _mm_add_epi32(_mm_madd_epi16(T_00_04A, c16_p38_p44), _mm_madd_epi16(T_00_05A, c16_p09_p25)); // EO0 EO0B = _mm_add_epi32(_mm_madd_epi16(T_00_04B, c16_p38_p44), _mm_madd_epi16(T_00_05B, c16_p09_p25)); EO1A = _mm_add_epi32(_mm_madd_epi16(T_00_04A, c16_n09_p38), _mm_madd_epi16(T_00_05A, c16_n25_n44)); // EO1 EO1B = _mm_add_epi32(_mm_madd_epi16(T_00_04B, c16_n09_p38), _mm_madd_epi16(T_00_05B, c16_n25_n44)); EO2A = _mm_add_epi32(_mm_madd_epi16(T_00_04A, c16_n44_p25), _mm_madd_epi16(T_00_05A, c16_p38_p09)); // EO2 EO2B = _mm_add_epi32(_mm_madd_epi16(T_00_04B, c16_n44_p25), _mm_madd_epi16(T_00_05B, c16_p38_p09)); EO3A = _mm_add_epi32(_mm_madd_epi16(T_00_04A, c16_n25_p09), _mm_madd_epi16(T_00_05A, c16_n44_p38)); // EO3 EO3B = _mm_add_epi32(_mm_madd_epi16(T_00_04B, c16_n25_p09), _mm_madd_epi16(T_00_05B, c16_n44_p38)); EEO0A = _mm_madd_epi16(T_00_06A, c16_p17_p42); EEO0B = _mm_madd_epi16(T_00_06B, c16_p17_p42); EEO1A = _mm_madd_epi16(T_00_06A, c16_n42_p17); EEO1B = _mm_madd_epi16(T_00_06B, c16_n42_p17); EEE0A = _mm_madd_epi16(T_00_07A, c16_p32_p32); EEE0B = _mm_madd_epi16(T_00_07B, c16_p32_p32); EEE1A = _mm_madd_epi16(T_00_07A, c16_n32_p32); EEE1B = _mm_madd_epi16(T_00_07B, c16_n32_p32); { const __m128i EE0A = _mm_add_epi32(EEE0A, EEO0A); // EE0 = EEE0 + EEO0 const __m128i EE0B = _mm_add_epi32(EEE0B, EEO0B); const __m128i EE1A = _mm_add_epi32(EEE1A, EEO1A); // EE1 = EEE1 + EEO1 const __m128i EE1B = _mm_add_epi32(EEE1B, EEO1B); const __m128i EE3A = _mm_sub_epi32(EEE0A, EEO0A); // EE2 = EEE0 - EEO0 const __m128i EE3B = _mm_sub_epi32(EEE0B, EEO0B); const __m128i EE2A = _mm_sub_epi32(EEE1A, EEO1A); // EE3 = EEE1 - EEO1 const __m128i EE2B = _mm_sub_epi32(EEE1B, EEO1B); const __m128i T10A = _mm_add_epi32(_mm_add_epi32(EE0A, EO0A), c32_rnd); // E0 (= EE0 + EO0) + rnd const __m128i T10B = _mm_add_epi32(_mm_add_epi32(EE0B, EO0B), c32_rnd); const __m128i T11A = _mm_add_epi32(_mm_add_epi32(EE1A, EO1A), c32_rnd); // E1 (= EE1 + EO1) + rnd const __m128i T11B = _mm_add_epi32(_mm_add_epi32(EE1B, EO1B), c32_rnd); const __m128i T12A = _mm_add_epi32(_mm_add_epi32(EE2A, EO2A), c32_rnd); // E2 (= EE2 + EO2) + rnd const __m128i T12B = _mm_add_epi32(_mm_add_epi32(EE2B, EO2B), c32_rnd); const __m128i T13A = _mm_add_epi32(_mm_add_epi32(EE3A, EO3A), c32_rnd); // E3 (= EE3 + EO3) + rnd const __m128i T13B = _mm_add_epi32(_mm_add_epi32(EE3B, EO3B), c32_rnd); const __m128i T14A = _mm_add_epi32(_mm_sub_epi32(EE3A, EO3A), c32_rnd); // E4 (= EE3 - EO3) + rnd const __m128i T14B = _mm_add_epi32(_mm_sub_epi32(EE3B, EO3B), c32_rnd); const __m128i T15A = _mm_add_epi32(_mm_sub_epi32(EE2A, EO2A), c32_rnd); // E5 (= EE2 - EO2) + rnd const __m128i T15B = _mm_add_epi32(_mm_sub_epi32(EE2B, EO2B), c32_rnd); const __m128i T16A = _mm_add_epi32(_mm_sub_epi32(EE1A, EO1A), c32_rnd); // E6 (= EE1 - EO1) + rnd const __m128i T16B = 
_mm_add_epi32(_mm_sub_epi32(EE1B, EO1B), c32_rnd); const __m128i T17A = _mm_add_epi32(_mm_sub_epi32(EE0A, EO0A), c32_rnd); // E7 (= EE0 - EO0) + rnd const __m128i T17B = _mm_add_epi32(_mm_sub_epi32(EE0B, EO0B), c32_rnd); const __m128i T30A = _mm_srai_epi32(_mm_add_epi32(T10A, O0A), nShift); // E0 + O0 + rnd [30 20 10 00] const __m128i T30B = _mm_srai_epi32(_mm_add_epi32(T10B, O0B), nShift); // [70 60 50 40] const __m128i T31A = _mm_srai_epi32(_mm_add_epi32(T11A, O1A), nShift); // E1 + O1 + rnd [31 21 11 01] const __m128i T31B = _mm_srai_epi32(_mm_add_epi32(T11B, O1B), nShift); // [71 61 51 41] const __m128i T32A = _mm_srai_epi32(_mm_add_epi32(T12A, O2A), nShift); // E2 + O2 + rnd [32 22 12 02] const __m128i T32B = _mm_srai_epi32(_mm_add_epi32(T12B, O2B), nShift); // [72 62 52 42] const __m128i T33A = _mm_srai_epi32(_mm_add_epi32(T13A, O3A), nShift); // E3 + O3 + rnd [33 23 13 03] const __m128i T33B = _mm_srai_epi32(_mm_add_epi32(T13B, O3B), nShift); // [73 63 53 43] const __m128i T34A = _mm_srai_epi32(_mm_add_epi32(T14A, O4A), nShift); // E4 [33 24 14 04] const __m128i T34B = _mm_srai_epi32(_mm_add_epi32(T14B, O4B), nShift); // [74 64 54 44] const __m128i T35A = _mm_srai_epi32(_mm_add_epi32(T15A, O5A), nShift); // E5 [35 25 15 05] const __m128i T35B = _mm_srai_epi32(_mm_add_epi32(T15B, O5B), nShift); // [75 65 55 45] const __m128i T36A = _mm_srai_epi32(_mm_add_epi32(T16A, O6A), nShift); // E6 [36 26 16 06] const __m128i T36B = _mm_srai_epi32(_mm_add_epi32(T16B, O6B), nShift); // [76 66 56 46] const __m128i T37A = _mm_srai_epi32(_mm_add_epi32(T17A, O7A), nShift); // E7 [37 27 17 07] const __m128i T37B = _mm_srai_epi32(_mm_add_epi32(T17B, O7B), nShift); // [77 67 57 47] const __m128i T38A = _mm_srai_epi32(_mm_sub_epi32(T17A, O7A), nShift); // E7 [30 20 10 00] x8 const __m128i T38B = _mm_srai_epi32(_mm_sub_epi32(T17B, O7B), nShift); // [70 60 50 40] const __m128i T39A = _mm_srai_epi32(_mm_sub_epi32(T16A, O6A), nShift); // E6 [31 21 11 01] x9 const __m128i T39B = _mm_srai_epi32(_mm_sub_epi32(T16B, O6B), nShift); // [71 61 51 41] const __m128i T3AA = _mm_srai_epi32(_mm_sub_epi32(T15A, O5A), nShift); // E5 [32 22 12 02] xA const __m128i T3AB = _mm_srai_epi32(_mm_sub_epi32(T15B, O5B), nShift); // [72 62 52 42] const __m128i T3BA = _mm_srai_epi32(_mm_sub_epi32(T14A, O4A), nShift); // E4 [33 23 13 03] xB const __m128i T3BB = _mm_srai_epi32(_mm_sub_epi32(T14B, O4B), nShift); // [73 63 53 43] const __m128i T3CA = _mm_srai_epi32(_mm_sub_epi32(T13A, O3A), nShift); // E3 - O3 + rnd [33 24 14 04] xC const __m128i T3CB = _mm_srai_epi32(_mm_sub_epi32(T13B, O3B), nShift); // [74 64 54 44] const __m128i T3DA = _mm_srai_epi32(_mm_sub_epi32(T12A, O2A), nShift); // E2 - O2 + rnd [35 25 15 05] xD const __m128i T3DB = _mm_srai_epi32(_mm_sub_epi32(T12B, O2B), nShift); // [75 65 55 45] const __m128i T3EA = _mm_srai_epi32(_mm_sub_epi32(T11A, O1A), nShift); // E1 - O1 + rnd [36 26 16 06] xE const __m128i T3EB = _mm_srai_epi32(_mm_sub_epi32(T11B, O1B), nShift); // [76 66 56 46] const __m128i T3FA = _mm_srai_epi32(_mm_sub_epi32(T10A, O0A), nShift); // E0 - O0 + rnd [37 27 17 07] xF const __m128i T3FB = _mm_srai_epi32(_mm_sub_epi32(T10B, O0B), nShift); // [77 67 57 47] res00[part] = _mm_packs_epi32(T30A, T30B); // [70 60 50 40 30 20 10 00] res01[part] = _mm_packs_epi32(T31A, T31B); // [71 61 51 41 31 21 11 01] res02[part] = _mm_packs_epi32(T32A, T32B); // [72 62 52 42 32 22 12 02] res03[part] = _mm_packs_epi32(T33A, T33B); // [73 63 53 43 33 23 13 03] res04[part] = _mm_packs_epi32(T34A, T34B); // [74 64 54 44 34 24 14 
04] res05[part] = _mm_packs_epi32(T35A, T35B); // [75 65 55 45 35 25 15 05] res06[part] = _mm_packs_epi32(T36A, T36B); // [76 66 56 46 36 26 16 06] res07[part] = _mm_packs_epi32(T37A, T37B); // [77 67 57 47 37 27 17 07] res08[part] = _mm_packs_epi32(T38A, T38B); // [A0 ... 80] res09[part] = _mm_packs_epi32(T39A, T39B); // [A1 ... 81] res10[part] = _mm_packs_epi32(T3AA, T3AB); // [A2 ... 82] res11[part] = _mm_packs_epi32(T3BA, T3BB); // [A3 ... 83] res12[part] = _mm_packs_epi32(T3CA, T3CB); // [A4 ... 84] res13[part] = _mm_packs_epi32(T3DA, T3DB); // [A5 ... 85] res14[part] = _mm_packs_epi32(T3EA, T3EB); // [A6 ... 86] res15[part] = _mm_packs_epi32(T3FA, T3FB); // [A7 ... 87] } } // transpose matrix 8x8 16bit { __m128i tr0_0, tr0_1, tr0_2, tr0_3, tr0_4, tr0_5, tr0_6, tr0_7; __m128i tr1_0, tr1_1, tr1_2, tr1_3, tr1_4, tr1_5, tr1_6, tr1_7; #define TRANSPOSE_8x8_16BIT(I0, I1, I2, I3, I4, I5, I6, I7, O0, O1, O2, O3, O4, O5, O6, O7) \ tr0_0 = _mm_unpacklo_epi16(I0, I1); \ tr0_1 = _mm_unpacklo_epi16(I2, I3); \ tr0_2 = _mm_unpackhi_epi16(I0, I1); \ tr0_3 = _mm_unpackhi_epi16(I2, I3); \ tr0_4 = _mm_unpacklo_epi16(I4, I5); \ tr0_5 = _mm_unpacklo_epi16(I6, I7); \ tr0_6 = _mm_unpackhi_epi16(I4, I5); \ tr0_7 = _mm_unpackhi_epi16(I6, I7); \ tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \ tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \ tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \ tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \ O0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ O1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ O2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ O3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ O4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \ O5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \ O6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \ O7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \ TRANSPOSE_8x8_16BIT(res00[0], res01[0], res02[0], res03[0], res04[0], res05[0], res06[0], res07[0], in00[0], in01[0], in02[0], in03[0], in04[0], in05[0], in06[0], in07[0]) TRANSPOSE_8x8_16BIT(res08[0], res09[0], res10[0], res11[0], res12[0], res13[0], res14[0], res15[0], in00[1], in01[1], in02[1], in03[1], in04[1], in05[1], in06[1], in07[1]) TRANSPOSE_8x8_16BIT(res00[1], res01[1], res02[1], res03[1], res04[1], res05[1], res06[1], res07[1], in08[0], in09[0], in10[0], in11[0], in12[0], in13[0], in14[0], in15[0]) TRANSPOSE_8x8_16BIT(res08[1], res09[1], res10[1], res11[1], res12[1], res13[1], res14[1], res15[1], in08[1], in09[1], in10[1], in11[1], in12[1], in13[1], in14[1], in15[1]) #undef TRANSPOSE_8x8_16BIT } nShift = shift2; c32_rnd = _mm_set1_epi32(1 << (shift2 - 1)); // add2 } // clip { const __m128i max_val = _mm_set1_epi16((1 << (clip_depth2 - 1)) - 1); const __m128i min_val = _mm_set1_epi16(-(1 << (clip_depth2 - 1))); in00[0] = _mm_max_epi16(_mm_min_epi16(in00[0], max_val), min_val); in00[1] = _mm_max_epi16(_mm_min_epi16(in00[1], max_val), min_val); in01[0] = _mm_max_epi16(_mm_min_epi16(in01[0], max_val), min_val); in01[1] = _mm_max_epi16(_mm_min_epi16(in01[1], max_val), min_val); in02[0] = _mm_max_epi16(_mm_min_epi16(in02[0], max_val), min_val); in02[1] = _mm_max_epi16(_mm_min_epi16(in02[1], max_val), min_val); in03[0] = _mm_max_epi16(_mm_min_epi16(in03[0], max_val), min_val); in03[1] = _mm_max_epi16(_mm_min_epi16(in03[1], max_val), min_val); in04[0] = _mm_max_epi16(_mm_min_epi16(in04[0], max_val), min_val); in04[1] = _mm_max_epi16(_mm_min_epi16(in04[1], max_val), min_val); in05[0] = 
_mm_max_epi16(_mm_min_epi16(in05[0], max_val), min_val); in05[1] = _mm_max_epi16(_mm_min_epi16(in05[1], max_val), min_val); in06[0] = _mm_max_epi16(_mm_min_epi16(in06[0], max_val), min_val); in06[1] = _mm_max_epi16(_mm_min_epi16(in06[1], max_val), min_val); in07[0] = _mm_max_epi16(_mm_min_epi16(in07[0], max_val), min_val); in07[1] = _mm_max_epi16(_mm_min_epi16(in07[1], max_val), min_val); in08[0] = _mm_max_epi16(_mm_min_epi16(in08[0], max_val), min_val); in08[1] = _mm_max_epi16(_mm_min_epi16(in08[1], max_val), min_val); in09[0] = _mm_max_epi16(_mm_min_epi16(in09[0], max_val), min_val); in09[1] = _mm_max_epi16(_mm_min_epi16(in09[1], max_val), min_val); in10[0] = _mm_max_epi16(_mm_min_epi16(in10[0], max_val), min_val); in10[1] = _mm_max_epi16(_mm_min_epi16(in10[1], max_val), min_val); in11[0] = _mm_max_epi16(_mm_min_epi16(in11[0], max_val), min_val); in11[1] = _mm_max_epi16(_mm_min_epi16(in11[1], max_val), min_val); in12[0] = _mm_max_epi16(_mm_min_epi16(in12[0], max_val), min_val); in12[1] = _mm_max_epi16(_mm_min_epi16(in12[1], max_val), min_val); in13[0] = _mm_max_epi16(_mm_min_epi16(in13[0], max_val), min_val); in13[1] = _mm_max_epi16(_mm_min_epi16(in13[1], max_val), min_val); in14[0] = _mm_max_epi16(_mm_min_epi16(in14[0], max_val), min_val); in14[1] = _mm_max_epi16(_mm_min_epi16(in14[1], max_val), min_val); in15[0] = _mm_max_epi16(_mm_min_epi16(in15[0], max_val), min_val); in15[1] = _mm_max_epi16(_mm_min_epi16(in15[1], max_val), min_val); } // store _mm_store_si128((__m128i*)(dst + 0 * i_dst + 0), in00[0]); _mm_store_si128((__m128i*)(dst + 0 * i_dst + 8), in00[1]); _mm_store_si128((__m128i*)(dst + 1 * i_dst + 0), in01[0]); _mm_store_si128((__m128i*)(dst + 1 * i_dst + 8), in01[1]); _mm_store_si128((__m128i*)(dst + 2 * i_dst + 0), in02[0]); _mm_store_si128((__m128i*)(dst + 2 * i_dst + 8), in02[1]); _mm_store_si128((__m128i*)(dst + 3 * i_dst + 0), in03[0]); _mm_store_si128((__m128i*)(dst + 3 * i_dst + 8), in03[1]); _mm_store_si128((__m128i*)(dst + 4 * i_dst + 0), in04[0]); _mm_store_si128((__m128i*)(dst + 4 * i_dst + 8), in04[1]); _mm_store_si128((__m128i*)(dst + 5 * i_dst + 0), in05[0]); _mm_store_si128((__m128i*)(dst + 5 * i_dst + 8), in05[1]); _mm_store_si128((__m128i*)(dst + 6 * i_dst + 0), in06[0]); _mm_store_si128((__m128i*)(dst + 6 * i_dst + 8), in06[1]); _mm_store_si128((__m128i*)(dst + 7 * i_dst + 0), in07[0]); _mm_store_si128((__m128i*)(dst + 7 * i_dst + 8), in07[1]); _mm_store_si128((__m128i*)(dst + 8 * i_dst + 0), in08[0]); _mm_store_si128((__m128i*)(dst + 8 * i_dst + 8), in08[1]); _mm_store_si128((__m128i*)(dst + 9 * i_dst + 0), in09[0]); _mm_store_si128((__m128i*)(dst + 9 * i_dst + 8), in09[1]); _mm_store_si128((__m128i*)(dst + 10 * i_dst + 0), in10[0]); _mm_store_si128((__m128i*)(dst + 10 * i_dst + 8), in10[1]); _mm_store_si128((__m128i*)(dst + 11 * i_dst + 0), in11[0]); _mm_store_si128((__m128i*)(dst + 11 * i_dst + 8), in11[1]); _mm_store_si128((__m128i*)(dst + 12 * i_dst + 0), in12[0]); _mm_store_si128((__m128i*)(dst + 12 * i_dst + 8), in12[1]); _mm_store_si128((__m128i*)(dst + 13 * i_dst + 0), in13[0]); _mm_store_si128((__m128i*)(dst + 13 * i_dst + 8), in13[1]); _mm_store_si128((__m128i*)(dst + 14 * i_dst + 0), in14[0]); _mm_store_si128((__m128i*)(dst + 14 * i_dst + 8), in14[1]); _mm_store_si128((__m128i*)(dst + 15 * i_dst + 0), in15[0]); _mm_store_si128((__m128i*)(dst + 15 * i_dst + 8), in15[1]); } /* --------------------------------------------------------------------------- */ void idct_16x16_half_sse128(const coeff_t *src, coeff_t *dst, int i_dst) { // TODO: implement 
this // only the top-left 8x8 sub-block holds non-zero coefficients, so rows 8-15 are treated as zero (Zero_8) //idct_16x16_sse128(src, dst, i_dst); const int shift1 = 5; const int shift2 = 20 - g_bit_depth; //const int clip_depth1 = LIMIT_BIT; const int clip_depth2 = g_bit_depth + 1; const __m128i c16_p43_p45 = _mm_set1_epi32(0x002B002D); //row0 87high - 90low address 1 3 const __m128i c16_p35_p40 = _mm_set1_epi32(0x00230028); // 5 7 const __m128i c16_p29_p43 = _mm_set1_epi32(0x001D002B); //row1 const __m128i c16_n21_p04 = _mm_set1_epi32(0xFFEB0004); const __m128i c16_p04_p40 = _mm_set1_epi32(0x00040028); //row2 const __m128i c16_n43_n35 = _mm_set1_epi32(0xFFD5FFDD); const __m128i c16_n21_p35 = _mm_set1_epi32(0xFFEB0023); //row3 const __m128i c16_p04_n43 = _mm_set1_epi32(0x0004FFD5); const __m128i c16_n40_p29 = _mm_set1_epi32(0xFFD8001D); //row4 const __m128i c16_p45_n13 = _mm_set1_epi32(0x002DFFF3); const __m128i c16_n45_p21 = _mm_set1_epi32(0xFFD30015); //row5 const __m128i c16_p13_p29 = _mm_set1_epi32(0x000D001D); const __m128i c16_n35_p13 = _mm_set1_epi32(0xFFDD000D); //row6 const __m128i c16_n40_p45 = _mm_set1_epi32(0xFFD8002D); const __m128i c16_n13_p04 = _mm_set1_epi32(0xFFF30004); //row7 const __m128i c16_n29_p21 = _mm_set1_epi32(0xFFE30015); const __m128i c16_p38_p44 = _mm_set1_epi32(0x0026002C); //row0 2 6 const __m128i c16_n09_p38 = _mm_set1_epi32(0xFFF70026); //row1 2 6 const __m128i c16_n44_p25 = _mm_set1_epi32(0xFFD40019); //row2 const __m128i c16_n25_p09 = _mm_set1_epi32(0xFFE70009); //row3 const __m128i c16_p17_p42 = _mm_set1_epi32(0x0011002A); //row0 4 12 const __m128i c16_n42_p17 = _mm_set1_epi32(0xFFD60011); //row1 4 12 const __m128i c16_n32_p32 = _mm_set1_epi32(0xFFE00020); //row1 0 8 const __m128i c16_p32_p32 = _mm_set1_epi32(0x00200020); //row0 0 8 int part; int nShift = shift1; __m128i c32_rnd = _mm_set1_epi32((1 << shift1) >> 1); // add1 __m128i Zero_8 = _mm_set1_epi16(0); // DCT1 __m128i in00[2], in01[2], in02[2], in03[2], in04[2], in05[2], in06[2], in07[2]; __m128i in08[2], in09[2], in10[2], in11[2], in12[2], in13[2], in14[2], in15[2]; __m128i res00[2], res01[2], res02[2], res03[2], res04[2], res05[2], res06[2], res07[2]; __m128i res08[2], res09[2], res10[2], res11[2], res12[2], res13[2], res14[2], res15[2]; in00[0] = _mm_load_si128((const __m128i*)&src[0 * 16]); // [07 06 05 04 03 02 01 00] in01[0] = _mm_load_si128((const __m128i*)&src[1 * 16]); // [17 16 15 14 13 12 11 10] in02[0] = _mm_load_si128((const __m128i*)&src[2 * 16]); // [27 26 25 24 23 22 21 20] in03[0] = _mm_load_si128((const __m128i*)&src[3 * 16]); // [37 36 35 34 33 32 31 30] in04[0] = _mm_load_si128((const __m128i*)&src[4 * 16]); // [47 46 45 44 43 42 41 40] in05[0] = _mm_load_si128((const __m128i*)&src[5 * 16]); // [57 56 55 54 53 52 51 50] in06[0] = _mm_load_si128((const __m128i*)&src[6 * 16]); // [67 66 65 64 63 62 61 60] in07[0] = _mm_load_si128((const __m128i*)&src[7 * 16]); // [77 76 75 74 73 72 71 70] //pass=1 { const __m128i T_00_00A = _mm_unpacklo_epi16(in01[0], in03[0]); // [33 13 32 12 31 11 30 10] const __m128i T_00_00B = _mm_unpackhi_epi16(in01[0], in03[0]); // [37 17 36 16 35 15 34 14] const __m128i T_00_01A = _mm_unpacklo_epi16(in05[0], in07[0]); // [ ] const __m128i T_00_01B = _mm_unpackhi_epi16(in05[0], in07[0]); // [ ] const __m128i T_00_04A = _mm_unpacklo_epi16(in02[0], in06[0]); // [ ] const __m128i T_00_04B = _mm_unpackhi_epi16(in02[0], in06[0]); // [ ] //4 12 const __m128i T_00_06A = _mm_unpacklo_epi16(in04[0], Zero_8); // [ ]row const __m128i T_00_06B = _mm_unpackhi_epi16(in04[0], Zero_8); // [ ] //0 8 const __m128i T_00_07A = _mm_unpacklo_epi16(in00[0], Zero_8); 
// [83 03 82 02 81 01 81 00] row08 row00 const __m128i T_00_07B = _mm_unpackhi_epi16(in00[0], Zero_8); // [87 07 86 06 85 05 84 04] __m128i O0A, O1A, O2A, O3A, O4A, O5A, O6A, O7A; __m128i O0B, O1B, O2B, O3B, O4B, O5B, O6B, O7B; __m128i EO0A, EO1A, EO2A, EO3A; __m128i EO0B, EO1B, EO2B, EO3B; __m128i EEO0A, EEO1A; __m128i EEO0B, EEO1B; __m128i EEE0A, EEE1A; __m128i EEE0B, EEE1B; //1 3 5 7 #define COMPUTE_ROW(row0103, row0507, c0103, c0507, row) \ row = _mm_add_epi32(_mm_madd_epi16(row0103, c0103), _mm_madd_epi16(row0507, c0507)); COMPUTE_ROW(T_00_00A, T_00_01A, c16_p43_p45, c16_p35_p40, O0A) COMPUTE_ROW(T_00_00A, T_00_01A, c16_p29_p43, c16_n21_p04, O1A) COMPUTE_ROW(T_00_00A, T_00_01A, c16_p04_p40, c16_n43_n35, O2A) COMPUTE_ROW(T_00_00A, T_00_01A, c16_n21_p35, c16_p04_n43, O3A) COMPUTE_ROW(T_00_00A, T_00_01A, c16_n40_p29, c16_p45_n13, O4A) COMPUTE_ROW(T_00_00A, T_00_01A, c16_n45_p21, c16_p13_p29, O5A) COMPUTE_ROW(T_00_00A, T_00_01A, c16_n35_p13, c16_n40_p45, O6A) COMPUTE_ROW(T_00_00A, T_00_01A, c16_n13_p04, c16_n29_p21, O7A) COMPUTE_ROW(T_00_00B, T_00_01B, c16_p43_p45, c16_p35_p40, O0B) COMPUTE_ROW(T_00_00B, T_00_01B, c16_p29_p43, c16_n21_p04, O1B) COMPUTE_ROW(T_00_00B, T_00_01B, c16_p04_p40, c16_n43_n35, O2B) COMPUTE_ROW(T_00_00B, T_00_01B, c16_n21_p35, c16_p04_n43, O3B) COMPUTE_ROW(T_00_00B, T_00_01B, c16_n40_p29, c16_p45_n13, O4B) COMPUTE_ROW(T_00_00B, T_00_01B, c16_n45_p21, c16_p13_p29, O5B) COMPUTE_ROW(T_00_00B, T_00_01B, c16_n35_p13, c16_n40_p45, O6B) COMPUTE_ROW(T_00_00B, T_00_01B, c16_n13_p04, c16_n29_p21, O7B) #undef COMPUTE_ROW //2 6 EO0A = _mm_madd_epi16(T_00_04A, c16_p38_p44); // EO0 EO0B = _mm_madd_epi16(T_00_04B, c16_p38_p44); EO1A = _mm_madd_epi16(T_00_04A, c16_n09_p38); // EO1 EO1B = _mm_madd_epi16(T_00_04B, c16_n09_p38); EO2A = _mm_madd_epi16(T_00_04A, c16_n44_p25); // EO2 EO2B = _mm_madd_epi16(T_00_04B, c16_n44_p25); EO3A = _mm_madd_epi16(T_00_04A, c16_n25_p09); // EO3 EO3B = _mm_madd_epi16(T_00_04B, c16_n25_p09); //4 12 EEO0A = _mm_madd_epi16(T_00_06A, c16_p17_p42); EEO0B = _mm_madd_epi16(T_00_06B, c16_p17_p42); EEO1A = _mm_madd_epi16(T_00_06A, c16_n42_p17); EEO1B = _mm_madd_epi16(T_00_06B, c16_n42_p17); //0 8 EEE0A = _mm_madd_epi16(T_00_07A, c16_p32_p32); EEE0B = _mm_madd_epi16(T_00_07B, c16_p32_p32); EEE1A = _mm_madd_epi16(T_00_07A, c16_n32_p32); EEE1B = _mm_madd_epi16(T_00_07B, c16_n32_p32); { const __m128i EE0A = _mm_add_epi32(EEE0A, EEO0A); // EE0 = EEE0 + EEO0 const __m128i EE0B = _mm_add_epi32(EEE0B, EEO0B); const __m128i EE1A = _mm_add_epi32(EEE1A, EEO1A); // EE1 = EEE1 + EEO1 const __m128i EE1B = _mm_add_epi32(EEE1B, EEO1B); const __m128i EE3A = _mm_sub_epi32(EEE0A, EEO0A); // EE2 = EEE0 - EEO0 const __m128i EE3B = _mm_sub_epi32(EEE0B, EEO0B); const __m128i EE2A = _mm_sub_epi32(EEE1A, EEO1A); // EE3 = EEE1 - EEO1 const __m128i EE2B = _mm_sub_epi32(EEE1B, EEO1B); const __m128i T10A = _mm_add_epi32(_mm_add_epi32(EE0A, EO0A), c32_rnd); // E0 (= EE0 + EO0) + rnd const __m128i T10B = _mm_add_epi32(_mm_add_epi32(EE0B, EO0B), c32_rnd); const __m128i T11A = _mm_add_epi32(_mm_add_epi32(EE1A, EO1A), c32_rnd); // E1 (= EE1 + EO1) + rnd const __m128i T11B = _mm_add_epi32(_mm_add_epi32(EE1B, EO1B), c32_rnd); const __m128i T12A = _mm_add_epi32(_mm_add_epi32(EE2A, EO2A), c32_rnd); // E2 (= EE2 + EO2) + rnd const __m128i T12B = _mm_add_epi32(_mm_add_epi32(EE2B, EO2B), c32_rnd); const __m128i T13A = _mm_add_epi32(_mm_add_epi32(EE3A, EO3A), c32_rnd); // E3 (= EE3 + EO3) + rnd const __m128i T13B = _mm_add_epi32(_mm_add_epi32(EE3B, EO3B), c32_rnd); const __m128i T14A = 
_mm_add_epi32(_mm_sub_epi32(EE3A, EO3A), c32_rnd); // E4 (= EE3 - EO3) + rnd const __m128i T14B = _mm_add_epi32(_mm_sub_epi32(EE3B, EO3B), c32_rnd); const __m128i T15A = _mm_add_epi32(_mm_sub_epi32(EE2A, EO2A), c32_rnd); // E5 (= EE2 - EO2) + rnd const __m128i T15B = _mm_add_epi32(_mm_sub_epi32(EE2B, EO2B), c32_rnd); const __m128i T16A = _mm_add_epi32(_mm_sub_epi32(EE1A, EO1A), c32_rnd); // E6 (= EE1 - EO1) + rnd const __m128i T16B = _mm_add_epi32(_mm_sub_epi32(EE1B, EO1B), c32_rnd); const __m128i T17A = _mm_add_epi32(_mm_sub_epi32(EE0A, EO0A), c32_rnd); // E7 (= EE0 - EO0) + rnd const __m128i T17B = _mm_add_epi32(_mm_sub_epi32(EE0B, EO0B), c32_rnd); const __m128i T30A = _mm_srai_epi32(_mm_add_epi32(T10A, O0A), nShift); // E0 + O0 + rnd [30 20 10 00] const __m128i T30B = _mm_srai_epi32(_mm_add_epi32(T10B, O0B), nShift); // [70 60 50 40] const __m128i T31A = _mm_srai_epi32(_mm_add_epi32(T11A, O1A), nShift); // E1 + O1 + rnd [31 21 11 01] const __m128i T31B = _mm_srai_epi32(_mm_add_epi32(T11B, O1B), nShift); // [71 61 51 41] const __m128i T32A = _mm_srai_epi32(_mm_add_epi32(T12A, O2A), nShift); // E2 + O2 + rnd [32 22 12 02] const __m128i T32B = _mm_srai_epi32(_mm_add_epi32(T12B, O2B), nShift); // [72 62 52 42] const __m128i T33A = _mm_srai_epi32(_mm_add_epi32(T13A, O3A), nShift); // E3 + O3 + rnd [33 23 13 03] const __m128i T33B = _mm_srai_epi32(_mm_add_epi32(T13B, O3B), nShift); // [73 63 53 43] const __m128i T34A = _mm_srai_epi32(_mm_add_epi32(T14A, O4A), nShift); // E4 [33 24 14 04] const __m128i T34B = _mm_srai_epi32(_mm_add_epi32(T14B, O4B), nShift); // [74 64 54 44] const __m128i T35A = _mm_srai_epi32(_mm_add_epi32(T15A, O5A), nShift); // E5 [35 25 15 05] const __m128i T35B = _mm_srai_epi32(_mm_add_epi32(T15B, O5B), nShift); // [75 65 55 45] const __m128i T36A = _mm_srai_epi32(_mm_add_epi32(T16A, O6A), nShift); // E6 [36 26 16 06] const __m128i T36B = _mm_srai_epi32(_mm_add_epi32(T16B, O6B), nShift); // [76 66 56 46] const __m128i T37A = _mm_srai_epi32(_mm_add_epi32(T17A, O7A), nShift); // E7 [37 27 17 07] const __m128i T37B = _mm_srai_epi32(_mm_add_epi32(T17B, O7B), nShift); // [77 67 57 47] const __m128i T38A = _mm_srai_epi32(_mm_sub_epi32(T17A, O7A), nShift); // E7 [30 20 10 00] x8 const __m128i T38B = _mm_srai_epi32(_mm_sub_epi32(T17B, O7B), nShift); // [70 60 50 40] const __m128i T39A = _mm_srai_epi32(_mm_sub_epi32(T16A, O6A), nShift); // E6 [31 21 11 01] x9 const __m128i T39B = _mm_srai_epi32(_mm_sub_epi32(T16B, O6B), nShift); // [71 61 51 41] const __m128i T3AA = _mm_srai_epi32(_mm_sub_epi32(T15A, O5A), nShift); // E5 [32 22 12 02] xA const __m128i T3AB = _mm_srai_epi32(_mm_sub_epi32(T15B, O5B), nShift); // [72 62 52 42] const __m128i T3BA = _mm_srai_epi32(_mm_sub_epi32(T14A, O4A), nShift); // E4 [33 23 13 03] xB const __m128i T3BB = _mm_srai_epi32(_mm_sub_epi32(T14B, O4B), nShift); // [73 63 53 43] const __m128i T3CA = _mm_srai_epi32(_mm_sub_epi32(T13A, O3A), nShift); // E3 - O3 + rnd [33 24 14 04] xC const __m128i T3CB = _mm_srai_epi32(_mm_sub_epi32(T13B, O3B), nShift); // [74 64 54 44] const __m128i T3DA = _mm_srai_epi32(_mm_sub_epi32(T12A, O2A), nShift); // E2 - O2 + rnd [35 25 15 05] xD const __m128i T3DB = _mm_srai_epi32(_mm_sub_epi32(T12B, O2B), nShift); // [75 65 55 45] const __m128i T3EA = _mm_srai_epi32(_mm_sub_epi32(T11A, O1A), nShift); // E1 - O1 + rnd [36 26 16 06] xE const __m128i T3EB = _mm_srai_epi32(_mm_sub_epi32(T11B, O1B), nShift); // [76 66 56 46] const __m128i T3FA = _mm_srai_epi32(_mm_sub_epi32(T10A, O0A), nShift); // E0 - O0 + rnd [37 27 17 07] xF const 
__m128i T3FB = _mm_srai_epi32(_mm_sub_epi32(T10B, O0B), nShift); // [77 67 57 47] res00[0] = _mm_packs_epi32(T30A, T30B); // [70 60 50 40 30 20 10 00] res01[0] = _mm_packs_epi32(T31A, T31B); // [71 61 51 41 31 21 11 01] res02[0] = _mm_packs_epi32(T32A, T32B); // [72 62 52 42 32 22 12 02] res03[0] = _mm_packs_epi32(T33A, T33B); // [73 63 53 43 33 23 13 03] res04[0] = _mm_packs_epi32(T34A, T34B); // [74 64 54 44 34 24 14 04] res05[0] = _mm_packs_epi32(T35A, T35B); // [75 65 55 45 35 25 15 05] res06[0] = _mm_packs_epi32(T36A, T36B); // [76 66 56 46 36 26 16 06] res07[0] = _mm_packs_epi32(T37A, T37B); // [77 67 57 47 37 27 17 07] res08[0] = _mm_packs_epi32(T38A, T38B); // [A0 ... 80] res09[0] = _mm_packs_epi32(T39A, T39B); // [A1 ... 81] res10[0] = _mm_packs_epi32(T3AA, T3AB); // [A2 ... 82] res11[0] = _mm_packs_epi32(T3BA, T3BB); // [A3 ... 83] res12[0] = _mm_packs_epi32(T3CA, T3CB); // [A4 ... 84] res13[0] = _mm_packs_epi32(T3DA, T3DB); // [A5 ... 85] res14[0] = _mm_packs_epi32(T3EA, T3EB); // [A6 ... 86] res15[0] = _mm_packs_epi32(T3FA, T3FB); // [A7 ... 87] } // transpose matrix 8x8 16bit { __m128i tr0_0, tr0_1, tr0_2, tr0_3, tr0_4, tr0_5, tr0_6, tr0_7; __m128i tr1_0, tr1_1, tr1_2, tr1_3, tr1_4, tr1_5, tr1_6, tr1_7; #define TRANSPOSE_8x8_16BIT(I0, I1, I2, I3, I4, I5, I6, I7, O0, O1, O2, O3, O4, O5, O6, O7) \ tr0_0 = _mm_unpacklo_epi16(I0, I1); \ tr0_1 = _mm_unpacklo_epi16(I2, I3); \ tr0_2 = _mm_unpackhi_epi16(I0, I1); \ tr0_3 = _mm_unpackhi_epi16(I2, I3); \ tr0_4 = _mm_unpacklo_epi16(I4, I5); \ tr0_5 = _mm_unpacklo_epi16(I6, I7); \ tr0_6 = _mm_unpackhi_epi16(I4, I5); \ tr0_7 = _mm_unpackhi_epi16(I6, I7); \ tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \ tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \ tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \ tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \ O0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ O1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ O2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ O3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ O4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \ O5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \ O6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \ O7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \ TRANSPOSE_8x8_16BIT(res00[0], res01[0], res02[0], res03[0], res04[0], res05[0], res06[0], res07[0], in00[0], in01[0], in02[0], in03[0], in04[0], in05[0], in06[0], in07[0]) TRANSPOSE_8x8_16BIT(res08[0], res09[0], res10[0], res11[0], res12[0], res13[0], res14[0], res15[0], in00[1], in01[1], in02[1], in03[1], in04[1], in05[1], in06[1], in07[1]) #undef TRANSPOSE_8x8_16BIT } nShift = shift2; c32_rnd = _mm_set1_epi32(1 << (shift2 - 1)); // add2 } //pass=2 { for (part = 0; part < 2; part++) { const __m128i T_00_00A = _mm_unpacklo_epi16(in01[part], in03[part]); // [33 13 32 12 31 11 30 10] const __m128i T_00_00B = _mm_unpackhi_epi16(in01[part], in03[part]); // [37 17 36 16 35 15 34 14] const __m128i T_00_01A = _mm_unpacklo_epi16(in05[part], in07[part]); // [ ] const __m128i T_00_01B = _mm_unpackhi_epi16(in05[part], in07[part]); // [ ] const __m128i T_00_04A = _mm_unpacklo_epi16(in02[part], in06[part]); // [ ] const __m128i T_00_04B = _mm_unpackhi_epi16(in02[part], in06[part]); // [ ] //4 12 const __m128i T_00_06A = _mm_unpacklo_epi16(in04[part], Zero_8); // [ ]row const __m128i T_00_06B = _mm_unpackhi_epi16(in04[part], Zero_8); // [ ] //0 8 const __m128i T_00_07A = _mm_unpacklo_epi16(in00[part], Zero_8); // [83 03 82 02 81 01 81 
00] row08 row00 const __m128i T_00_07B = _mm_unpackhi_epi16(in00[part], Zero_8); // [87 07 86 06 85 05 84 04] __m128i O0A, O1A, O2A, O3A, O4A, O5A, O6A, O7A; __m128i O0B, O1B, O2B, O3B, O4B, O5B, O6B, O7B; __m128i EO0A, EO1A, EO2A, EO3A; __m128i EO0B, EO1B, EO2B, EO3B; __m128i EEO0A, EEO1A; __m128i EEO0B, EEO1B; __m128i EEE0A, EEE1A; __m128i EEE0B, EEE1B; //1 3 5 7 #define COMPUTE_ROW(row0103, row0507, c0103, c0507, row) \ row = _mm_add_epi32(_mm_madd_epi16(row0103, c0103), _mm_madd_epi16(row0507, c0507)); COMPUTE_ROW(T_00_00A, T_00_01A, c16_p43_p45, c16_p35_p40, O0A) COMPUTE_ROW(T_00_00A, T_00_01A, c16_p29_p43, c16_n21_p04, O1A) COMPUTE_ROW(T_00_00A, T_00_01A, c16_p04_p40, c16_n43_n35, O2A) COMPUTE_ROW(T_00_00A, T_00_01A, c16_n21_p35, c16_p04_n43, O3A) COMPUTE_ROW(T_00_00A, T_00_01A, c16_n40_p29, c16_p45_n13, O4A) COMPUTE_ROW(T_00_00A, T_00_01A, c16_n45_p21, c16_p13_p29, O5A) COMPUTE_ROW(T_00_00A, T_00_01A, c16_n35_p13, c16_n40_p45, O6A) COMPUTE_ROW(T_00_00A, T_00_01A, c16_n13_p04, c16_n29_p21, O7A) COMPUTE_ROW(T_00_00B, T_00_01B, c16_p43_p45, c16_p35_p40, O0B) COMPUTE_ROW(T_00_00B, T_00_01B, c16_p29_p43, c16_n21_p04, O1B) COMPUTE_ROW(T_00_00B, T_00_01B, c16_p04_p40, c16_n43_n35, O2B) COMPUTE_ROW(T_00_00B, T_00_01B, c16_n21_p35, c16_p04_n43, O3B) COMPUTE_ROW(T_00_00B, T_00_01B, c16_n40_p29, c16_p45_n13, O4B) COMPUTE_ROW(T_00_00B, T_00_01B, c16_n45_p21, c16_p13_p29, O5B) COMPUTE_ROW(T_00_00B, T_00_01B, c16_n35_p13, c16_n40_p45, O6B) COMPUTE_ROW(T_00_00B, T_00_01B, c16_n13_p04, c16_n29_p21, O7B) #undef COMPUTE_ROW //2 6 EO0A = _mm_madd_epi16(T_00_04A, c16_p38_p44); // EO0 EO0B = _mm_madd_epi16(T_00_04B, c16_p38_p44); EO1A = _mm_madd_epi16(T_00_04A, c16_n09_p38); // EO1 EO1B = _mm_madd_epi16(T_00_04B, c16_n09_p38); EO2A = _mm_madd_epi16(T_00_04A, c16_n44_p25); // EO2 EO2B = _mm_madd_epi16(T_00_04B, c16_n44_p25); EO3A = _mm_madd_epi16(T_00_04A, c16_n25_p09); // EO3 EO3B = _mm_madd_epi16(T_00_04B, c16_n25_p09); //4 12 EEO0A = _mm_madd_epi16(T_00_06A, c16_p17_p42); EEO0B = _mm_madd_epi16(T_00_06B, c16_p17_p42); EEO1A = _mm_madd_epi16(T_00_06A, c16_n42_p17); EEO1B = _mm_madd_epi16(T_00_06B, c16_n42_p17); //0 8 EEE0A = _mm_madd_epi16(T_00_07A, c16_p32_p32); EEE0B = _mm_madd_epi16(T_00_07B, c16_p32_p32); EEE1A = _mm_madd_epi16(T_00_07A, c16_n32_p32); EEE1B = _mm_madd_epi16(T_00_07B, c16_n32_p32); { const __m128i EE0A = _mm_add_epi32(EEE0A, EEO0A); // EE0 = EEE0 + EEO0 const __m128i EE0B = _mm_add_epi32(EEE0B, EEO0B); const __m128i EE1A = _mm_add_epi32(EEE1A, EEO1A); // EE1 = EEE1 + EEO1 const __m128i EE1B = _mm_add_epi32(EEE1B, EEO1B); const __m128i EE3A = _mm_sub_epi32(EEE0A, EEO0A); // EE2 = EEE0 - EEO0 const __m128i EE3B = _mm_sub_epi32(EEE0B, EEO0B); const __m128i EE2A = _mm_sub_epi32(EEE1A, EEO1A); // EE3 = EEE1 - EEO1 const __m128i EE2B = _mm_sub_epi32(EEE1B, EEO1B); const __m128i T10A = _mm_add_epi32(_mm_add_epi32(EE0A, EO0A), c32_rnd); // E0 (= EE0 + EO0) + rnd const __m128i T10B = _mm_add_epi32(_mm_add_epi32(EE0B, EO0B), c32_rnd); const __m128i T11A = _mm_add_epi32(_mm_add_epi32(EE1A, EO1A), c32_rnd); // E1 (= EE1 + EO1) + rnd const __m128i T11B = _mm_add_epi32(_mm_add_epi32(EE1B, EO1B), c32_rnd); const __m128i T12A = _mm_add_epi32(_mm_add_epi32(EE2A, EO2A), c32_rnd); // E2 (= EE2 + EO2) + rnd const __m128i T12B = _mm_add_epi32(_mm_add_epi32(EE2B, EO2B), c32_rnd); const __m128i T13A = _mm_add_epi32(_mm_add_epi32(EE3A, EO3A), c32_rnd); // E3 (= EE3 + EO3) + rnd const __m128i T13B = _mm_add_epi32(_mm_add_epi32(EE3B, EO3B), c32_rnd); const __m128i T14A = _mm_add_epi32(_mm_sub_epi32(EE3A, 
EO3A), c32_rnd); // E4 (= EE3 - EO3) + rnd const __m128i T14B = _mm_add_epi32(_mm_sub_epi32(EE3B, EO3B), c32_rnd); const __m128i T15A = _mm_add_epi32(_mm_sub_epi32(EE2A, EO2A), c32_rnd); // E5 (= EE2 - EO2) + rnd const __m128i T15B = _mm_add_epi32(_mm_sub_epi32(EE2B, EO2B), c32_rnd); const __m128i T16A = _mm_add_epi32(_mm_sub_epi32(EE1A, EO1A), c32_rnd); // E6 (= EE1 - EO1) + rnd const __m128i T16B = _mm_add_epi32(_mm_sub_epi32(EE1B, EO1B), c32_rnd); const __m128i T17A = _mm_add_epi32(_mm_sub_epi32(EE0A, EO0A), c32_rnd); // E7 (= EE0 - EO0) + rnd const __m128i T17B = _mm_add_epi32(_mm_sub_epi32(EE0B, EO0B), c32_rnd); const __m128i T30A = _mm_srai_epi32(_mm_add_epi32(T10A, O0A), nShift); // E0 + O0 + rnd [30 20 10 00] const __m128i T30B = _mm_srai_epi32(_mm_add_epi32(T10B, O0B), nShift); // [70 60 50 40] const __m128i T31A = _mm_srai_epi32(_mm_add_epi32(T11A, O1A), nShift); // E1 + O1 + rnd [31 21 11 01] const __m128i T31B = _mm_srai_epi32(_mm_add_epi32(T11B, O1B), nShift); // [71 61 51 41] const __m128i T32A = _mm_srai_epi32(_mm_add_epi32(T12A, O2A), nShift); // E2 + O2 + rnd [32 22 12 02] const __m128i T32B = _mm_srai_epi32(_mm_add_epi32(T12B, O2B), nShift); // [72 62 52 42] const __m128i T33A = _mm_srai_epi32(_mm_add_epi32(T13A, O3A), nShift); // E3 + O3 + rnd [33 23 13 03] const __m128i T33B = _mm_srai_epi32(_mm_add_epi32(T13B, O3B), nShift); // [73 63 53 43] const __m128i T34A = _mm_srai_epi32(_mm_add_epi32(T14A, O4A), nShift); // E4 [33 24 14 04] const __m128i T34B = _mm_srai_epi32(_mm_add_epi32(T14B, O4B), nShift); // [74 64 54 44] const __m128i T35A = _mm_srai_epi32(_mm_add_epi32(T15A, O5A), nShift); // E5 [35 25 15 05] const __m128i T35B = _mm_srai_epi32(_mm_add_epi32(T15B, O5B), nShift); // [75 65 55 45] const __m128i T36A = _mm_srai_epi32(_mm_add_epi32(T16A, O6A), nShift); // E6 [36 26 16 06] const __m128i T36B = _mm_srai_epi32(_mm_add_epi32(T16B, O6B), nShift); // [76 66 56 46] const __m128i T37A = _mm_srai_epi32(_mm_add_epi32(T17A, O7A), nShift); // E7 [37 27 17 07] const __m128i T37B = _mm_srai_epi32(_mm_add_epi32(T17B, O7B), nShift); // [77 67 57 47] const __m128i T38A = _mm_srai_epi32(_mm_sub_epi32(T17A, O7A), nShift); // E7 [30 20 10 00] x8 const __m128i T38B = _mm_srai_epi32(_mm_sub_epi32(T17B, O7B), nShift); // [70 60 50 40] const __m128i T39A = _mm_srai_epi32(_mm_sub_epi32(T16A, O6A), nShift); // E6 [31 21 11 01] x9 const __m128i T39B = _mm_srai_epi32(_mm_sub_epi32(T16B, O6B), nShift); // [71 61 51 41] const __m128i T3AA = _mm_srai_epi32(_mm_sub_epi32(T15A, O5A), nShift); // E5 [32 22 12 02] xA const __m128i T3AB = _mm_srai_epi32(_mm_sub_epi32(T15B, O5B), nShift); // [72 62 52 42] const __m128i T3BA = _mm_srai_epi32(_mm_sub_epi32(T14A, O4A), nShift); // E4 [33 23 13 03] xB const __m128i T3BB = _mm_srai_epi32(_mm_sub_epi32(T14B, O4B), nShift); // [73 63 53 43] const __m128i T3CA = _mm_srai_epi32(_mm_sub_epi32(T13A, O3A), nShift); // E3 - O3 + rnd [33 24 14 04] xC const __m128i T3CB = _mm_srai_epi32(_mm_sub_epi32(T13B, O3B), nShift); // [74 64 54 44] const __m128i T3DA = _mm_srai_epi32(_mm_sub_epi32(T12A, O2A), nShift); // E2 - O2 + rnd [35 25 15 05] xD const __m128i T3DB = _mm_srai_epi32(_mm_sub_epi32(T12B, O2B), nShift); // [75 65 55 45] const __m128i T3EA = _mm_srai_epi32(_mm_sub_epi32(T11A, O1A), nShift); // E1 - O1 + rnd [36 26 16 06] xE const __m128i T3EB = _mm_srai_epi32(_mm_sub_epi32(T11B, O1B), nShift); // [76 66 56 46] const __m128i T3FA = _mm_srai_epi32(_mm_sub_epi32(T10A, O0A), nShift); // E0 - O0 + rnd [37 27 17 07] xF const __m128i T3FB = 
_mm_srai_epi32(_mm_sub_epi32(T10B, O0B), nShift); // [77 67 57 47] res00[part] = _mm_packs_epi32(T30A, T30B); // [70 60 50 40 30 20 10 00] res01[part] = _mm_packs_epi32(T31A, T31B); // [71 61 51 41 31 21 11 01] res02[part] = _mm_packs_epi32(T32A, T32B); // [72 62 52 42 32 22 12 02] res03[part] = _mm_packs_epi32(T33A, T33B); // [73 63 53 43 33 23 13 03] res04[part] = _mm_packs_epi32(T34A, T34B); // [74 64 54 44 34 24 14 04] res05[part] = _mm_packs_epi32(T35A, T35B); // [75 65 55 45 35 25 15 05] res06[part] = _mm_packs_epi32(T36A, T36B); // [76 66 56 46 36 26 16 06] res07[part] = _mm_packs_epi32(T37A, T37B); // [77 67 57 47 37 27 17 07] res08[part] = _mm_packs_epi32(T38A, T38B); // [A0 ... 80] res09[part] = _mm_packs_epi32(T39A, T39B); // [A1 ... 81] res10[part] = _mm_packs_epi32(T3AA, T3AB); // [A2 ... 82] res11[part] = _mm_packs_epi32(T3BA, T3BB); // [A3 ... 83] res12[part] = _mm_packs_epi32(T3CA, T3CB); // [A4 ... 84] res13[part] = _mm_packs_epi32(T3DA, T3DB); // [A5 ... 85] res14[part] = _mm_packs_epi32(T3EA, T3EB); // [A6 ... 86] res15[part] = _mm_packs_epi32(T3FA, T3FB); // [A7 ... 87] } } // transpose matrix 8x8 16bit { __m128i tr0_0, tr0_1, tr0_2, tr0_3, tr0_4, tr0_5, tr0_6, tr0_7; __m128i tr1_0, tr1_1, tr1_2, tr1_3, tr1_4, tr1_5, tr1_6, tr1_7; #define TRANSPOSE_8x8_16BIT(I0, I1, I2, I3, I4, I5, I6, I7, O0, O1, O2, O3, O4, O5, O6, O7) \ tr0_0 = _mm_unpacklo_epi16(I0, I1); \ tr0_1 = _mm_unpacklo_epi16(I2, I3); \ tr0_2 = _mm_unpackhi_epi16(I0, I1); \ tr0_3 = _mm_unpackhi_epi16(I2, I3); \ tr0_4 = _mm_unpacklo_epi16(I4, I5); \ tr0_5 = _mm_unpacklo_epi16(I6, I7); \ tr0_6 = _mm_unpackhi_epi16(I4, I5); \ tr0_7 = _mm_unpackhi_epi16(I6, I7); \ tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \ tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \ tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \ tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \ O0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ O1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ O2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ O3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ O4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \ O5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \ O6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \ O7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \ TRANSPOSE_8x8_16BIT(res00[0], res01[0], res02[0], res03[0], res04[0], res05[0], res06[0], res07[0], in00[0], in01[0], in02[0], in03[0], in04[0], in05[0], in06[0], in07[0]) TRANSPOSE_8x8_16BIT(res08[0], res09[0], res10[0], res11[0], res12[0], res13[0], res14[0], res15[0], in00[1], in01[1], in02[1], in03[1], in04[1], in05[1], in06[1], in07[1]) TRANSPOSE_8x8_16BIT(res00[1], res01[1], res02[1], res03[1], res04[1], res05[1], res06[1], res07[1], in08[0], in09[0], in10[0], in11[0], in12[0], in13[0], in14[0], in15[0]) TRANSPOSE_8x8_16BIT(res08[1], res09[1], res10[1], res11[1], res12[1], res13[1], res14[1], res15[1], in08[1], in09[1], in10[1], in11[1], in12[1], in13[1], in14[1], in15[1]) #undef TRANSPOSE_8x8_16BIT } } // clip { const __m128i max_val = _mm_set1_epi16((1 << (clip_depth2 - 1)) - 1); const __m128i min_val = _mm_set1_epi16(-(1 << (clip_depth2 - 1))); in00[0] = _mm_max_epi16(_mm_min_epi16(in00[0], max_val), min_val); in00[1] = _mm_max_epi16(_mm_min_epi16(in00[1], max_val), min_val); in01[0] = _mm_max_epi16(_mm_min_epi16(in01[0], max_val), min_val); in01[1] = _mm_max_epi16(_mm_min_epi16(in01[1], max_val), min_val); in02[0] = _mm_max_epi16(_mm_min_epi16(in02[0], max_val), min_val); 
in02[1] = _mm_max_epi16(_mm_min_epi16(in02[1], max_val), min_val); in03[0] = _mm_max_epi16(_mm_min_epi16(in03[0], max_val), min_val); in03[1] = _mm_max_epi16(_mm_min_epi16(in03[1], max_val), min_val); in04[0] = _mm_max_epi16(_mm_min_epi16(in04[0], max_val), min_val); in04[1] = _mm_max_epi16(_mm_min_epi16(in04[1], max_val), min_val); in05[0] = _mm_max_epi16(_mm_min_epi16(in05[0], max_val), min_val); in05[1] = _mm_max_epi16(_mm_min_epi16(in05[1], max_val), min_val); in06[0] = _mm_max_epi16(_mm_min_epi16(in06[0], max_val), min_val); in06[1] = _mm_max_epi16(_mm_min_epi16(in06[1], max_val), min_val); in07[0] = _mm_max_epi16(_mm_min_epi16(in07[0], max_val), min_val); in07[1] = _mm_max_epi16(_mm_min_epi16(in07[1], max_val), min_val); in08[0] = _mm_max_epi16(_mm_min_epi16(in08[0], max_val), min_val); in08[1] = _mm_max_epi16(_mm_min_epi16(in08[1], max_val), min_val); in09[0] = _mm_max_epi16(_mm_min_epi16(in09[0], max_val), min_val); in09[1] = _mm_max_epi16(_mm_min_epi16(in09[1], max_val), min_val); in10[0] = _mm_max_epi16(_mm_min_epi16(in10[0], max_val), min_val); in10[1] = _mm_max_epi16(_mm_min_epi16(in10[1], max_val), min_val); in11[0] = _mm_max_epi16(_mm_min_epi16(in11[0], max_val), min_val); in11[1] = _mm_max_epi16(_mm_min_epi16(in11[1], max_val), min_val); in12[0] = _mm_max_epi16(_mm_min_epi16(in12[0], max_val), min_val); in12[1] = _mm_max_epi16(_mm_min_epi16(in12[1], max_val), min_val); in13[0] = _mm_max_epi16(_mm_min_epi16(in13[0], max_val), min_val); in13[1] = _mm_max_epi16(_mm_min_epi16(in13[1], max_val), min_val); in14[0] = _mm_max_epi16(_mm_min_epi16(in14[0], max_val), min_val); in14[1] = _mm_max_epi16(_mm_min_epi16(in14[1], max_val), min_val); in15[0] = _mm_max_epi16(_mm_min_epi16(in15[0], max_val), min_val); in15[1] = _mm_max_epi16(_mm_min_epi16(in15[1], max_val), min_val); } // store _mm_store_si128((__m128i*)(dst + 0 * i_dst + 0), in00[0]); _mm_store_si128((__m128i*)(dst + 0 * i_dst + 8), in00[1]); _mm_store_si128((__m128i*)(dst + 1 * i_dst + 0), in01[0]); _mm_store_si128((__m128i*)(dst + 1 * i_dst + 8), in01[1]); _mm_store_si128((__m128i*)(dst + 2 * i_dst + 0), in02[0]); _mm_store_si128((__m128i*)(dst + 2 * i_dst + 8), in02[1]); _mm_store_si128((__m128i*)(dst + 3 * i_dst + 0), in03[0]); _mm_store_si128((__m128i*)(dst + 3 * i_dst + 8), in03[1]); _mm_store_si128((__m128i*)(dst + 4 * i_dst + 0), in04[0]); _mm_store_si128((__m128i*)(dst + 4 * i_dst + 8), in04[1]); _mm_store_si128((__m128i*)(dst + 5 * i_dst + 0), in05[0]); _mm_store_si128((__m128i*)(dst + 5 * i_dst + 8), in05[1]); _mm_store_si128((__m128i*)(dst + 6 * i_dst + 0), in06[0]); _mm_store_si128((__m128i*)(dst + 6 * i_dst + 8), in06[1]); _mm_store_si128((__m128i*)(dst + 7 * i_dst + 0), in07[0]); _mm_store_si128((__m128i*)(dst + 7 * i_dst + 8), in07[1]); _mm_store_si128((__m128i*)(dst + 8 * i_dst + 0), in08[0]); _mm_store_si128((__m128i*)(dst + 8 * i_dst + 8), in08[1]); _mm_store_si128((__m128i*)(dst + 9 * i_dst + 0), in09[0]); _mm_store_si128((__m128i*)(dst + 9 * i_dst + 8), in09[1]); _mm_store_si128((__m128i*)(dst + 10 * i_dst + 0), in10[0]); _mm_store_si128((__m128i*)(dst + 10 * i_dst + 8), in10[1]); _mm_store_si128((__m128i*)(dst + 11 * i_dst + 0), in11[0]); _mm_store_si128((__m128i*)(dst + 11 * i_dst + 8), in11[1]); _mm_store_si128((__m128i*)(dst + 12 * i_dst + 0), in12[0]); _mm_store_si128((__m128i*)(dst + 12 * i_dst + 8), in12[1]); _mm_store_si128((__m128i*)(dst + 13 * i_dst + 0), in13[0]); _mm_store_si128((__m128i*)(dst + 13 * i_dst + 8), in13[1]); _mm_store_si128((__m128i*)(dst + 14 * i_dst + 0), in14[0]); 
_mm_store_si128((__m128i*)(dst + 14 * i_dst + 8), in14[1]); _mm_store_si128((__m128i*)(dst + 15 * i_dst + 0), in15[0]); _mm_store_si128((__m128i*)(dst + 15 * i_dst + 8), in15[1]); } /* --------------------------------------------------------------------------- */ void idct_16x16_quad_sse128(const coeff_t *src, coeff_t *dst, int i_dst) { // TODO: implement this // only the top-left 4x4 sub-block holds non-zero coefficients, so only rows 0-3 are loaded //idct_16x16_half_sse128(src, dst, i_dst); const int shift1 = 5; const int shift2 = 20 - g_bit_depth; //const int clip_depth1 = LIMIT_BIT; const int clip_depth2 = g_bit_depth + 1; const __m128i c16_p43_p45 = _mm_set1_epi32(0x002B002D); //row0 87high - 90low address 1 3 const __m128i c16_p29_p43 = _mm_set1_epi32(0x001D002B); //row1 const __m128i c16_p04_p40 = _mm_set1_epi32(0x00040028); //row2 const __m128i c16_n21_p35 = _mm_set1_epi32(0xFFEB0023); //row3 const __m128i c16_n40_p29 = _mm_set1_epi32(0xFFD8001D); //row4 const __m128i c16_n45_p21 = _mm_set1_epi32(0xFFD30015); //row5 const __m128i c16_n35_p13 = _mm_set1_epi32(0xFFDD000D); //row6 const __m128i c16_n13_p04 = _mm_set1_epi32(0xFFF30004); //row7 const __m128i c16_p38_p44 = _mm_set1_epi32(0x0026002C); //row0 2 6 const __m128i c16_n09_p38 = _mm_set1_epi32(0xFFF70026); //row1 2 6 const __m128i c16_n44_p25 = _mm_set1_epi32(0xFFD40019); //row2 const __m128i c16_n25_p09 = _mm_set1_epi32(0xFFE70009); //row3 // const __m128i c16_p17_p42 = _mm_set1_epi32(0x0011002A); //row0 4 12 // const __m128i c16_n42_p17 = _mm_set1_epi32(0xFFD60011); //row1 4 12 const __m128i c16_n32_p32 = _mm_set1_epi32(0xFFE00020); //row1 0 8 const __m128i c16_p32_p32 = _mm_set1_epi32(0x00200020); //row0 0 8 int part; int nShift = shift1; __m128i c32_rnd = _mm_set1_epi32((1 << shift1) >> 1); // add1 __m128i Zero_8 = _mm_set1_epi16(0); __m128i add_zero = _mm_set1_epi32(0); // DCT1 __m128i in00[2], in01[2], in02[2], in03[2], in04[2], in05[2], in06[2], in07[2]; __m128i in08[2], in09[2], in10[2], in11[2], in12[2], in13[2], in14[2], in15[2]; __m128i res00[2], res01[2], res02[2], res03[2], res04[2], res05[2], res06[2], res07[2]; __m128i res08[2], res09[2], res10[2], res11[2], res12[2], res13[2], res14[2], res15[2]; in00[0] = _mm_load_si128((const __m128i*)&src[0 * 16]); // [07 06 05 04 03 02 01 00] in01[0] = _mm_load_si128((const __m128i*)&src[1 * 16]); // [17 16 15 14 13 12 11 10] in02[0] = _mm_load_si128((const __m128i*)&src[2 * 16]); // [27 26 25 24 23 22 21 20] in03[0] = _mm_load_si128((const __m128i*)&src[3 * 16]); // [37 36 35 34 33 32 31 30] //pass=1 { const __m128i T_00_00A = _mm_unpacklo_epi16(in01[0], in03[0]); // [33 13 32 12 31 11 30 10] // const __m128i T_00_00B = _mm_unpackhi_epi16(in01[0], in03[0]); // [37 17 36 16 35 15 34 14] const __m128i T_00_04A = _mm_unpacklo_epi16(in02[0], Zero_8); // [ ] // const __m128i T_00_04B = _mm_unpackhi_epi16(in02[0], Zero_8); // [ ] //0 8 const __m128i T_00_07A = _mm_unpacklo_epi16(in00[0], Zero_8); // [83 03 82 02 81 01 80 00] row08 row00 // const __m128i T_00_07B = _mm_unpackhi_epi16(in00[0], Zero_8); // [87 07 86 06 85 05 84 04] __m128i O0A, O1A, O2A, O3A, O4A, O5A, O6A, O7A; __m128i EO0A, EO1A, EO2A, EO3A; __m128i EEE0A, EEE1A; //1 3 O0A = _mm_madd_epi16(T_00_00A, c16_p43_p45); O1A = _mm_madd_epi16(T_00_00A, c16_p29_p43); O2A = _mm_madd_epi16(T_00_00A, c16_p04_p40); O3A = _mm_madd_epi16(T_00_00A, c16_n21_p35); O4A = _mm_madd_epi16(T_00_00A, c16_n40_p29); O5A = _mm_madd_epi16(T_00_00A, c16_n45_p21); O6A = _mm_madd_epi16(T_00_00A, c16_n35_p13); O7A = _mm_madd_epi16(T_00_00A, c16_n13_p04); //2 6 EO0A = _mm_madd_epi16(T_00_04A, c16_p38_p44); // EO0 EO1A = 
_mm_madd_epi16(T_00_04A, c16_n09_p38); // EO1 EO2A = _mm_madd_epi16(T_00_04A, c16_n44_p25); // EO2 EO3A = _mm_madd_epi16(T_00_04A, c16_n25_p09); // EO3 //0 8 EEE0A = _mm_madd_epi16(T_00_07A, c16_p32_p32); EEE1A = _mm_madd_epi16(T_00_07A, c16_n32_p32); { const __m128i EE0A = _mm_add_epi32(EEE0A, add_zero); // EE0 = EEE0 + EEO0 const __m128i EE1A = _mm_add_epi32(EEE1A, add_zero); // EE1 = EEE1 + EEO1 const __m128i EE3A = _mm_sub_epi32(EEE0A, add_zero); // EE2 = EEE0 - EEO0 const __m128i EE2A = _mm_sub_epi32(EEE1A, add_zero); // EE3 = EEE1 - EEO1 const __m128i T10A = _mm_add_epi32(_mm_add_epi32(EE0A, EO0A), c32_rnd); // E0 (= EE0 + EO0) + rnd const __m128i T11A = _mm_add_epi32(_mm_add_epi32(EE1A, EO1A), c32_rnd); // E1 (= EE1 + EO1) + rnd const __m128i T12A = _mm_add_epi32(_mm_add_epi32(EE2A, EO2A), c32_rnd); // E2 (= EE2 + EO2) + rnd const __m128i T13A = _mm_add_epi32(_mm_add_epi32(EE3A, EO3A), c32_rnd); // E3 (= EE3 + EO3) + rnd const __m128i T14A = _mm_add_epi32(_mm_sub_epi32(EE3A, EO3A), c32_rnd); // E4 (= EE3 - EO3) + rnd const __m128i T15A = _mm_add_epi32(_mm_sub_epi32(EE2A, EO2A), c32_rnd); // E5 (= EE2 - EO2) + rnd const __m128i T16A = _mm_add_epi32(_mm_sub_epi32(EE1A, EO1A), c32_rnd); // E6 (= EE1 - EO1) + rnd const __m128i T17A = _mm_add_epi32(_mm_sub_epi32(EE0A, EO0A), c32_rnd); // E7 (= EE0 - EO0) + rnd const __m128i T30A = _mm_srai_epi32(_mm_add_epi32(T10A, O0A), nShift); // E0 + O0 + rnd [30 20 10 00] const __m128i T31A = _mm_srai_epi32(_mm_add_epi32(T11A, O1A), nShift); // E1 + O1 + rnd [31 21 11 01] const __m128i T32A = _mm_srai_epi32(_mm_add_epi32(T12A, O2A), nShift); // E2 + O2 + rnd [32 22 12 02] const __m128i T33A = _mm_srai_epi32(_mm_add_epi32(T13A, O3A), nShift); // E3 + O3 + rnd [33 23 13 03] const __m128i T34A = _mm_srai_epi32(_mm_add_epi32(T14A, O4A), nShift); // E4 [33 24 14 04] const __m128i T35A = _mm_srai_epi32(_mm_add_epi32(T15A, O5A), nShift); // E5 [35 25 15 05] const __m128i T36A = _mm_srai_epi32(_mm_add_epi32(T16A, O6A), nShift); // E6 [36 26 16 06] const __m128i T37A = _mm_srai_epi32(_mm_add_epi32(T17A, O7A), nShift); // E7 [37 27 17 07] const __m128i T38A = _mm_srai_epi32(_mm_sub_epi32(T17A, O7A), nShift); // E7 [30 20 10 00] x8 const __m128i T39A = _mm_srai_epi32(_mm_sub_epi32(T16A, O6A), nShift); // E6 [31 21 11 01] x9 const __m128i T3AA = _mm_srai_epi32(_mm_sub_epi32(T15A, O5A), nShift); // E5 [32 22 12 02] xA const __m128i T3BA = _mm_srai_epi32(_mm_sub_epi32(T14A, O4A), nShift); // E4 [33 23 13 03] xB const __m128i T3CA = _mm_srai_epi32(_mm_sub_epi32(T13A, O3A), nShift); // E3 - O3 + rnd [33 24 14 04] xC const __m128i T3DA = _mm_srai_epi32(_mm_sub_epi32(T12A, O2A), nShift); // E2 - O2 + rnd [35 25 15 05] xD const __m128i T3EA = _mm_srai_epi32(_mm_sub_epi32(T11A, O1A), nShift); // E1 - O1 + rnd [36 26 16 06] xE const __m128i T3FA = _mm_srai_epi32(_mm_sub_epi32(T10A, O0A), nShift); // E0 - O0 + rnd [37 27 17 07] xF res00[0] = _mm_packs_epi32(T30A, add_zero); // [70 60 50 40 30 20 10 00] res01[0] = _mm_packs_epi32(T31A, add_zero); // [71 61 51 41 31 21 11 01] res02[0] = _mm_packs_epi32(T32A, add_zero); // [72 62 52 42 32 22 12 02] res03[0] = _mm_packs_epi32(T33A, add_zero); // [73 63 53 43 33 23 13 03] res04[0] = _mm_packs_epi32(T34A, add_zero); // [74 64 54 44 34 24 14 04] res05[0] = _mm_packs_epi32(T35A, add_zero); // [75 65 55 45 35 25 15 05] res06[0] = _mm_packs_epi32(T36A, add_zero); // [76 66 56 46 36 26 16 06] res07[0] = _mm_packs_epi32(T37A, add_zero); // [77 67 57 47 37 27 17 07] res08[0] = _mm_packs_epi32(T38A, add_zero); // [A0 ... 
80] res09[0] = _mm_packs_epi32(T39A, add_zero); // [A1 ... 81] res10[0] = _mm_packs_epi32(T3AA, add_zero); // [A2 ... 82] res11[0] = _mm_packs_epi32(T3BA, add_zero); // [A3 ... 83] res12[0] = _mm_packs_epi32(T3CA, add_zero); // [A4 ... 84] res13[0] = _mm_packs_epi32(T3DA, add_zero); // [A5 ... 85] res14[0] = _mm_packs_epi32(T3EA, add_zero); // [A6 ... 86] res15[0] = _mm_packs_epi32(T3FA, add_zero); // [A7 ... 87] } // transpose matrix 8x8 16bit { __m128i tr0_0, tr0_1, tr0_2, tr0_3, tr0_4, tr0_5, tr0_6, tr0_7; __m128i tr1_0, tr1_1, tr1_2, tr1_3, tr1_4, tr1_5, tr1_6, tr1_7; #define TRANSPOSE_8x8_16BIT(I0, I1, I2, I3, I4, I5, I6, I7, O0, O1, O2, O3, O4, O5, O6, O7) \ tr0_0 = _mm_unpacklo_epi16(I0, I1); \ tr0_1 = _mm_unpacklo_epi16(I2, I3); \ tr0_2 = _mm_unpackhi_epi16(I0, I1); \ tr0_3 = _mm_unpackhi_epi16(I2, I3); \ tr0_4 = _mm_unpacklo_epi16(I4, I5); \ tr0_5 = _mm_unpacklo_epi16(I6, I7); \ tr0_6 = _mm_unpackhi_epi16(I4, I5); \ tr0_7 = _mm_unpackhi_epi16(I6, I7); \ tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \ tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \ tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \ tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \ O0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ O1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ O2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ O3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ O4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \ O5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \ O6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \ O7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \ TRANSPOSE_8x8_16BIT(res00[0], res01[0], res02[0], res03[0], res04[0], res05[0], res06[0], res07[0], in00[0], in01[0], in02[0], in03[0], in04[0], in05[0], in06[0], in07[0]) TRANSPOSE_8x8_16BIT(res08[0], res09[0], res10[0], res11[0], res12[0], res13[0], res14[0], res15[0], in00[1], in01[1], in02[1], in03[1], in04[1], in05[1], in06[1], in07[1]) #undef TRANSPOSE_8x8_16BIT } nShift = shift2; c32_rnd = _mm_set1_epi32(1 << (shift2 - 1)); // add2 } //pass=2 { for (part = 0; part < 2; part++) { const __m128i T_00_00A = _mm_unpacklo_epi16(in01[part], in03[part]); // [33 13 32 12 31 11 30 10] const __m128i T_00_00B = _mm_unpackhi_epi16(in01[part], in03[part]); // [37 17 36 16 35 15 34 14] const __m128i T_00_04A = _mm_unpacklo_epi16(in02[part], Zero_8); // [ ] const __m128i T_00_04B = _mm_unpackhi_epi16(in02[part], Zero_8); // [ ] //0 8 const __m128i T_00_07A = _mm_unpacklo_epi16(in00[part], Zero_8); // [83 03 82 02 81 01 81 00] row08 row00 const __m128i T_00_07B = _mm_unpackhi_epi16(in00[part], Zero_8); // [87 07 86 06 85 05 84 04] __m128i O0A, O1A, O2A, O3A, O4A, O5A, O6A, O7A; __m128i O0B, O1B, O2B, O3B, O4B, O5B, O6B, O7B; __m128i EO0A, EO1A, EO2A, EO3A; __m128i EO0B, EO1B, EO2B, EO3B; __m128i EEE0A, EEE1A; __m128i EEE0B, EEE1B; //1 3 5 7 O0A = _mm_madd_epi16(T_00_00A, c16_p43_p45); O1A = _mm_madd_epi16(T_00_00A, c16_p29_p43); O2A = _mm_madd_epi16(T_00_00A, c16_p04_p40); O3A = _mm_madd_epi16(T_00_00A, c16_n21_p35); O4A = _mm_madd_epi16(T_00_00A, c16_n40_p29); O5A = _mm_madd_epi16(T_00_00A, c16_n45_p21); O6A = _mm_madd_epi16(T_00_00A, c16_n35_p13); O7A = _mm_madd_epi16(T_00_00A, c16_n13_p04); O0B = _mm_madd_epi16(T_00_00B, c16_p43_p45); O1B = _mm_madd_epi16(T_00_00B, c16_p29_p43); O2B = _mm_madd_epi16(T_00_00B, c16_p04_p40); O3B = _mm_madd_epi16(T_00_00B, c16_n21_p35); O4B = _mm_madd_epi16(T_00_00B, c16_n40_p29); O5B = _mm_madd_epi16(T_00_00B, c16_n45_p21); O6B = 
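/* Note (pass 2 of the quad kernel): after the transpose every lane of the
 * intermediate registers may be non-zero, so both unpacked halves are
 * computed here -- the A registers cover lanes 0..3 and the B registers
 * lanes 4..7 -- whereas pass 1 above could leave the unpackhi (B) variants
 * commented out. */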
_mm_madd_epi16(T_00_00B, c16_n35_p13); O7B = _mm_madd_epi16(T_00_00B, c16_n13_p04); //2 6 EO0A = _mm_madd_epi16(T_00_04A, c16_p38_p44); // EO0 EO0B = _mm_madd_epi16(T_00_04B, c16_p38_p44); EO1A = _mm_madd_epi16(T_00_04A, c16_n09_p38); // EO1 EO1B = _mm_madd_epi16(T_00_04B, c16_n09_p38); EO2A = _mm_madd_epi16(T_00_04A, c16_n44_p25); // EO2 EO2B = _mm_madd_epi16(T_00_04B, c16_n44_p25); EO3A = _mm_madd_epi16(T_00_04A, c16_n25_p09); // EO3 EO3B = _mm_madd_epi16(T_00_04B, c16_n25_p09); //0 8 EEE0A = _mm_madd_epi16(T_00_07A, c16_p32_p32); EEE0B = _mm_madd_epi16(T_00_07B, c16_p32_p32); EEE1A = _mm_madd_epi16(T_00_07A, c16_n32_p32); EEE1B = _mm_madd_epi16(T_00_07B, c16_n32_p32); { const __m128i EE0A = _mm_add_epi32(EEE0A, add_zero); // EE0 = EEE0 + EEO0 const __m128i EE0B = _mm_add_epi32(EEE0B, add_zero); const __m128i EE1A = _mm_add_epi32(EEE1A, add_zero); // EE1 = EEE1 + EEO1 const __m128i EE1B = _mm_add_epi32(EEE1B, add_zero); const __m128i EE3A = _mm_sub_epi32(EEE0A, add_zero); // EE2 = EEE0 - EEO0 const __m128i EE3B = _mm_sub_epi32(EEE0B, add_zero); const __m128i EE2A = _mm_sub_epi32(EEE1A, add_zero); // EE3 = EEE1 - EEO1 const __m128i EE2B = _mm_sub_epi32(EEE1B, add_zero); const __m128i T10A = _mm_add_epi32(_mm_add_epi32(EE0A, EO0A), c32_rnd); // E0 (= EE0 + EO0) + rnd const __m128i T10B = _mm_add_epi32(_mm_add_epi32(EE0B, EO0B), c32_rnd); const __m128i T11A = _mm_add_epi32(_mm_add_epi32(EE1A, EO1A), c32_rnd); // E1 (= EE1 + EO1) + rnd const __m128i T11B = _mm_add_epi32(_mm_add_epi32(EE1B, EO1B), c32_rnd); const __m128i T12A = _mm_add_epi32(_mm_add_epi32(EE2A, EO2A), c32_rnd); // E2 (= EE2 + EO2) + rnd const __m128i T12B = _mm_add_epi32(_mm_add_epi32(EE2B, EO2B), c32_rnd); const __m128i T13A = _mm_add_epi32(_mm_add_epi32(EE3A, EO3A), c32_rnd); // E3 (= EE3 + EO3) + rnd const __m128i T13B = _mm_add_epi32(_mm_add_epi32(EE3B, EO3B), c32_rnd); const __m128i T14A = _mm_add_epi32(_mm_sub_epi32(EE3A, EO3A), c32_rnd); // E4 (= EE3 - EO3) + rnd const __m128i T14B = _mm_add_epi32(_mm_sub_epi32(EE3B, EO3B), c32_rnd); const __m128i T15A = _mm_add_epi32(_mm_sub_epi32(EE2A, EO2A), c32_rnd); // E5 (= EE2 - EO2) + rnd const __m128i T15B = _mm_add_epi32(_mm_sub_epi32(EE2B, EO2B), c32_rnd); const __m128i T16A = _mm_add_epi32(_mm_sub_epi32(EE1A, EO1A), c32_rnd); // E6 (= EE1 - EO1) + rnd const __m128i T16B = _mm_add_epi32(_mm_sub_epi32(EE1B, EO1B), c32_rnd); const __m128i T17A = _mm_add_epi32(_mm_sub_epi32(EE0A, EO0A), c32_rnd); // E7 (= EE0 - EO0) + rnd const __m128i T17B = _mm_add_epi32(_mm_sub_epi32(EE0B, EO0B), c32_rnd); const __m128i T30A = _mm_srai_epi32(_mm_add_epi32(T10A, O0A), nShift); // E0 + O0 + rnd [30 20 10 00] const __m128i T30B = _mm_srai_epi32(_mm_add_epi32(T10B, O0B), nShift); // [70 60 50 40] const __m128i T31A = _mm_srai_epi32(_mm_add_epi32(T11A, O1A), nShift); // E1 + O1 + rnd [31 21 11 01] const __m128i T31B = _mm_srai_epi32(_mm_add_epi32(T11B, O1B), nShift); // [71 61 51 41] const __m128i T32A = _mm_srai_epi32(_mm_add_epi32(T12A, O2A), nShift); // E2 + O2 + rnd [32 22 12 02] const __m128i T32B = _mm_srai_epi32(_mm_add_epi32(T12B, O2B), nShift); // [72 62 52 42] const __m128i T33A = _mm_srai_epi32(_mm_add_epi32(T13A, O3A), nShift); // E3 + O3 + rnd [33 23 13 03] const __m128i T33B = _mm_srai_epi32(_mm_add_epi32(T13B, O3B), nShift); // [73 63 53 43] const __m128i T34A = _mm_srai_epi32(_mm_add_epi32(T14A, O4A), nShift); // E4 [33 24 14 04] const __m128i T34B = _mm_srai_epi32(_mm_add_epi32(T14B, O4B), nShift); // [74 64 54 44] const __m128i T35A = _mm_srai_epi32(_mm_add_epi32(T15A, O5A), 
nShift); // E5 [35 25 15 05] const __m128i T35B = _mm_srai_epi32(_mm_add_epi32(T15B, O5B), nShift); // [75 65 55 45] const __m128i T36A = _mm_srai_epi32(_mm_add_epi32(T16A, O6A), nShift); // E6 [36 26 16 06] const __m128i T36B = _mm_srai_epi32(_mm_add_epi32(T16B, O6B), nShift); // [76 66 56 46] const __m128i T37A = _mm_srai_epi32(_mm_add_epi32(T17A, O7A), nShift); // E7 [37 27 17 07] const __m128i T37B = _mm_srai_epi32(_mm_add_epi32(T17B, O7B), nShift); // [77 67 57 47] const __m128i T38A = _mm_srai_epi32(_mm_sub_epi32(T17A, O7A), nShift); // E7 [30 20 10 00] x8 const __m128i T38B = _mm_srai_epi32(_mm_sub_epi32(T17B, O7B), nShift); // [70 60 50 40] const __m128i T39A = _mm_srai_epi32(_mm_sub_epi32(T16A, O6A), nShift); // E6 [31 21 11 01] x9 const __m128i T39B = _mm_srai_epi32(_mm_sub_epi32(T16B, O6B), nShift); // [71 61 51 41] const __m128i T3AA = _mm_srai_epi32(_mm_sub_epi32(T15A, O5A), nShift); // E5 [32 22 12 02] xA const __m128i T3AB = _mm_srai_epi32(_mm_sub_epi32(T15B, O5B), nShift); // [72 62 52 42] const __m128i T3BA = _mm_srai_epi32(_mm_sub_epi32(T14A, O4A), nShift); // E4 [33 23 13 03] xB const __m128i T3BB = _mm_srai_epi32(_mm_sub_epi32(T14B, O4B), nShift); // [73 63 53 43] const __m128i T3CA = _mm_srai_epi32(_mm_sub_epi32(T13A, O3A), nShift); // E3 - O3 + rnd [33 24 14 04] xC const __m128i T3CB = _mm_srai_epi32(_mm_sub_epi32(T13B, O3B), nShift); // [74 64 54 44] const __m128i T3DA = _mm_srai_epi32(_mm_sub_epi32(T12A, O2A), nShift); // E2 - O2 + rnd [35 25 15 05] xD const __m128i T3DB = _mm_srai_epi32(_mm_sub_epi32(T12B, O2B), nShift); // [75 65 55 45] const __m128i T3EA = _mm_srai_epi32(_mm_sub_epi32(T11A, O1A), nShift); // E1 - O1 + rnd [36 26 16 06] xE const __m128i T3EB = _mm_srai_epi32(_mm_sub_epi32(T11B, O1B), nShift); // [76 66 56 46] const __m128i T3FA = _mm_srai_epi32(_mm_sub_epi32(T10A, O0A), nShift); // E0 - O0 + rnd [37 27 17 07] xF const __m128i T3FB = _mm_srai_epi32(_mm_sub_epi32(T10B, O0B), nShift); // [77 67 57 47] res00[part] = _mm_packs_epi32(T30A, T30B); // [70 60 50 40 30 20 10 00] res01[part] = _mm_packs_epi32(T31A, T31B); // [71 61 51 41 31 21 11 01] res02[part] = _mm_packs_epi32(T32A, T32B); // [72 62 52 42 32 22 12 02] res03[part] = _mm_packs_epi32(T33A, T33B); // [73 63 53 43 33 23 13 03] res04[part] = _mm_packs_epi32(T34A, T34B); // [74 64 54 44 34 24 14 04] res05[part] = _mm_packs_epi32(T35A, T35B); // [75 65 55 45 35 25 15 05] res06[part] = _mm_packs_epi32(T36A, T36B); // [76 66 56 46 36 26 16 06] res07[part] = _mm_packs_epi32(T37A, T37B); // [77 67 57 47 37 27 17 07] res08[part] = _mm_packs_epi32(T38A, T38B); // [A0 ... 80] res09[part] = _mm_packs_epi32(T39A, T39B); // [A1 ... 81] res10[part] = _mm_packs_epi32(T3AA, T3AB); // [A2 ... 82] res11[part] = _mm_packs_epi32(T3BA, T3BB); // [A3 ... 83] res12[part] = _mm_packs_epi32(T3CA, T3CB); // [A4 ... 84] res13[part] = _mm_packs_epi32(T3DA, T3DB); // [A5 ... 85] res14[part] = _mm_packs_epi32(T3EA, T3EB); // [A6 ... 86] res15[part] = _mm_packs_epi32(T3FA, T3FB); // [A7 ... 
87] } } // transpose matrix 8x8 16bit { __m128i tr0_0, tr0_1, tr0_2, tr0_3, tr0_4, tr0_5, tr0_6, tr0_7; __m128i tr1_0, tr1_1, tr1_2, tr1_3, tr1_4, tr1_5, tr1_6, tr1_7; #define TRANSPOSE_8x8_16BIT(I0, I1, I2, I3, I4, I5, I6, I7, O0, O1, O2, O3, O4, O5, O6, O7) \ tr0_0 = _mm_unpacklo_epi16(I0, I1); \ tr0_1 = _mm_unpacklo_epi16(I2, I3); \ tr0_2 = _mm_unpackhi_epi16(I0, I1); \ tr0_3 = _mm_unpackhi_epi16(I2, I3); \ tr0_4 = _mm_unpacklo_epi16(I4, I5); \ tr0_5 = _mm_unpacklo_epi16(I6, I7); \ tr0_6 = _mm_unpackhi_epi16(I4, I5); \ tr0_7 = _mm_unpackhi_epi16(I6, I7); \ tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \ tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \ tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \ tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \ O0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ O1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ O2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ O3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ O4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \ O5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \ O6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \ O7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \ TRANSPOSE_8x8_16BIT(res00[0], res01[0], res02[0], res03[0], res04[0], res05[0], res06[0], res07[0], in00[0], in01[0], in02[0], in03[0], in04[0], in05[0], in06[0], in07[0]) TRANSPOSE_8x8_16BIT(res08[0], res09[0], res10[0], res11[0], res12[0], res13[0], res14[0], res15[0], in00[1], in01[1], in02[1], in03[1], in04[1], in05[1], in06[1], in07[1]) TRANSPOSE_8x8_16BIT(res00[1], res01[1], res02[1], res03[1], res04[1], res05[1], res06[1], res07[1], in08[0], in09[0], in10[0], in11[0], in12[0], in13[0], in14[0], in15[0]) TRANSPOSE_8x8_16BIT(res08[1], res09[1], res10[1], res11[1], res12[1], res13[1], res14[1], res15[1], in08[1], in09[1], in10[1], in11[1], in12[1], in13[1], in14[1], in15[1]) #undef TRANSPOSE_8x8_16BIT } } // clip { const __m128i max_val = _mm_set1_epi16((1 << (clip_depth2 - 1)) - 1); const __m128i min_val = _mm_set1_epi16(-(1 << (clip_depth2 - 1))); in00[0] = _mm_max_epi16(_mm_min_epi16(in00[0], max_val), min_val); in00[1] = _mm_max_epi16(_mm_min_epi16(in00[1], max_val), min_val); in01[0] = _mm_max_epi16(_mm_min_epi16(in01[0], max_val), min_val); in01[1] = _mm_max_epi16(_mm_min_epi16(in01[1], max_val), min_val); in02[0] = _mm_max_epi16(_mm_min_epi16(in02[0], max_val), min_val); in02[1] = _mm_max_epi16(_mm_min_epi16(in02[1], max_val), min_val); in03[0] = _mm_max_epi16(_mm_min_epi16(in03[0], max_val), min_val); in03[1] = _mm_max_epi16(_mm_min_epi16(in03[1], max_val), min_val); in04[0] = _mm_max_epi16(_mm_min_epi16(in04[0], max_val), min_val); in04[1] = _mm_max_epi16(_mm_min_epi16(in04[1], max_val), min_val); in05[0] = _mm_max_epi16(_mm_min_epi16(in05[0], max_val), min_val); in05[1] = _mm_max_epi16(_mm_min_epi16(in05[1], max_val), min_val); in06[0] = _mm_max_epi16(_mm_min_epi16(in06[0], max_val), min_val); in06[1] = _mm_max_epi16(_mm_min_epi16(in06[1], max_val), min_val); in07[0] = _mm_max_epi16(_mm_min_epi16(in07[0], max_val), min_val); in07[1] = _mm_max_epi16(_mm_min_epi16(in07[1], max_val), min_val); in08[0] = _mm_max_epi16(_mm_min_epi16(in08[0], max_val), min_val); in08[1] = _mm_max_epi16(_mm_min_epi16(in08[1], max_val), min_val); in09[0] = _mm_max_epi16(_mm_min_epi16(in09[0], max_val), min_val); in09[1] = _mm_max_epi16(_mm_min_epi16(in09[1], max_val), min_val); in10[0] = _mm_max_epi16(_mm_min_epi16(in10[0], max_val), min_val); in10[1] = 
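/* Clamp each result to the signed (g_bit_depth + 1)-bit range set up by
 * clip_depth2, i.e. [-(1 << g_bit_depth), (1 << g_bit_depth) - 1]; the same
 * clip step also closes the other idct_*_sse128 kernels in this file. */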
_mm_max_epi16(_mm_min_epi16(in10[1], max_val), min_val); in11[0] = _mm_max_epi16(_mm_min_epi16(in11[0], max_val), min_val); in11[1] = _mm_max_epi16(_mm_min_epi16(in11[1], max_val), min_val); in12[0] = _mm_max_epi16(_mm_min_epi16(in12[0], max_val), min_val); in12[1] = _mm_max_epi16(_mm_min_epi16(in12[1], max_val), min_val); in13[0] = _mm_max_epi16(_mm_min_epi16(in13[0], max_val), min_val); in13[1] = _mm_max_epi16(_mm_min_epi16(in13[1], max_val), min_val); in14[0] = _mm_max_epi16(_mm_min_epi16(in14[0], max_val), min_val); in14[1] = _mm_max_epi16(_mm_min_epi16(in14[1], max_val), min_val); in15[0] = _mm_max_epi16(_mm_min_epi16(in15[0], max_val), min_val); in15[1] = _mm_max_epi16(_mm_min_epi16(in15[1], max_val), min_val); } // store _mm_store_si128((__m128i*)(dst + 0 * i_dst + 0), in00[0]); _mm_store_si128((__m128i*)(dst + 0 * i_dst + 8), in00[1]); _mm_store_si128((__m128i*)(dst + 1 * i_dst + 0), in01[0]); _mm_store_si128((__m128i*)(dst + 1 * i_dst + 8), in01[1]); _mm_store_si128((__m128i*)(dst + 2 * i_dst + 0), in02[0]); _mm_store_si128((__m128i*)(dst + 2 * i_dst + 8), in02[1]); _mm_store_si128((__m128i*)(dst + 3 * i_dst + 0), in03[0]); _mm_store_si128((__m128i*)(dst + 3 * i_dst + 8), in03[1]); _mm_store_si128((__m128i*)(dst + 4 * i_dst + 0), in04[0]); _mm_store_si128((__m128i*)(dst + 4 * i_dst + 8), in04[1]); _mm_store_si128((__m128i*)(dst + 5 * i_dst + 0), in05[0]); _mm_store_si128((__m128i*)(dst + 5 * i_dst + 8), in05[1]); _mm_store_si128((__m128i*)(dst + 6 * i_dst + 0), in06[0]); _mm_store_si128((__m128i*)(dst + 6 * i_dst + 8), in06[1]); _mm_store_si128((__m128i*)(dst + 7 * i_dst + 0), in07[0]); _mm_store_si128((__m128i*)(dst + 7 * i_dst + 8), in07[1]); _mm_store_si128((__m128i*)(dst + 8 * i_dst + 0), in08[0]); _mm_store_si128((__m128i*)(dst + 8 * i_dst + 8), in08[1]); _mm_store_si128((__m128i*)(dst + 9 * i_dst + 0), in09[0]); _mm_store_si128((__m128i*)(dst + 9 * i_dst + 8), in09[1]); _mm_store_si128((__m128i*)(dst + 10 * i_dst + 0), in10[0]); _mm_store_si128((__m128i*)(dst + 10 * i_dst + 8), in10[1]); _mm_store_si128((__m128i*)(dst + 11 * i_dst + 0), in11[0]); _mm_store_si128((__m128i*)(dst + 11 * i_dst + 8), in11[1]); _mm_store_si128((__m128i*)(dst + 12 * i_dst + 0), in12[0]); _mm_store_si128((__m128i*)(dst + 12 * i_dst + 8), in12[1]); _mm_store_si128((__m128i*)(dst + 13 * i_dst + 0), in13[0]); _mm_store_si128((__m128i*)(dst + 13 * i_dst + 8), in13[1]); _mm_store_si128((__m128i*)(dst + 14 * i_dst + 0), in14[0]); _mm_store_si128((__m128i*)(dst + 14 * i_dst + 8), in14[1]); _mm_store_si128((__m128i*)(dst + 15 * i_dst + 0), in15[0]); _mm_store_si128((__m128i*)(dst + 15 * i_dst + 8), in15[1]); } /* --------------------------------------------------------------------------- */ void idct_32x32_sse128(const coeff_t *src, coeff_t *dst, int i_dst) { int a_flag = i_dst & 0x01; //int shift1 = 5; int shift2 = 20 - g_bit_depth - a_flag; //int clip_depth1 = LIMIT_BIT; int clip_depth2 = g_bit_depth + 1 + a_flag; const __m128i c16_p45_p45 = _mm_set1_epi32(0x002D002D); const __m128i c16_p43_p44 = _mm_set1_epi32(0x002B002C); const __m128i c16_p39_p41 = _mm_set1_epi32(0x00270029); const __m128i c16_p34_p36 = _mm_set1_epi32(0x00220024); const __m128i c16_p27_p30 = _mm_set1_epi32(0x001B001E); const __m128i c16_p19_p23 = _mm_set1_epi32(0x00130017); const __m128i c16_p11_p15 = _mm_set1_epi32(0x000B000F); const __m128i c16_p02_p07 = _mm_set1_epi32(0x00020007); const __m128i c16_p41_p45 = _mm_set1_epi32(0x0029002D); const __m128i c16_p23_p34 = _mm_set1_epi32(0x00170022); const __m128i c16_n02_p11 = 
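/* Constant naming used throughout: c16_<hi>_<lo> packs coefficient <lo>
 * into the low 16 bits and <hi> into the high 16 bits of each 32-bit lane
 * ("p" = positive, "n" = negative); e.g. this one is (hi = -2, lo = +11)
 * = 0xFFFE000B, so a single _mm_madd_epi16 against an interleaved row pair
 * yields lo*first_row + hi*second_row in every lane. */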
_mm_set1_epi32(0xFFFE000B); const __m128i c16_n27_n15 = _mm_set1_epi32(0xFFE5FFF1); const __m128i c16_n43_n36 = _mm_set1_epi32(0xFFD5FFDC); const __m128i c16_n44_n45 = _mm_set1_epi32(0xFFD4FFD3); const __m128i c16_n30_n39 = _mm_set1_epi32(0xFFE2FFD9); const __m128i c16_n07_n19 = _mm_set1_epi32(0xFFF9FFED); const __m128i c16_p34_p44 = _mm_set1_epi32(0x0022002C); const __m128i c16_n07_p15 = _mm_set1_epi32(0xFFF9000F); const __m128i c16_n41_n27 = _mm_set1_epi32(0xFFD7FFE5); const __m128i c16_n39_n45 = _mm_set1_epi32(0xFFD9FFD3); const __m128i c16_n02_n23 = _mm_set1_epi32(0xFFFEFFE9); const __m128i c16_p36_p19 = _mm_set1_epi32(0x00240013); const __m128i c16_p43_p45 = _mm_set1_epi32(0x002B002D); const __m128i c16_p11_p30 = _mm_set1_epi32(0x000B001E); const __m128i c16_p23_p43 = _mm_set1_epi32(0x0017002B); const __m128i c16_n34_n07 = _mm_set1_epi32(0xFFDEFFF9); const __m128i c16_n36_n45 = _mm_set1_epi32(0xFFDCFFD3); const __m128i c16_p19_n11 = _mm_set1_epi32(0x0013FFF5); const __m128i c16_p44_p41 = _mm_set1_epi32(0x002C0029); const __m128i c16_n02_p27 = _mm_set1_epi32(0xFFFE001B); const __m128i c16_n45_n30 = _mm_set1_epi32(0xFFD3FFE2); const __m128i c16_n15_n39 = _mm_set1_epi32(0xFFF1FFD9); const __m128i c16_p11_p41 = _mm_set1_epi32(0x000B0029); const __m128i c16_n45_n27 = _mm_set1_epi32(0xFFD3FFE5); const __m128i c16_p07_n30 = _mm_set1_epi32(0x0007FFE2); const __m128i c16_p43_p39 = _mm_set1_epi32(0x002B0027); const __m128i c16_n23_p15 = _mm_set1_epi32(0xFFE9000F); const __m128i c16_n34_n45 = _mm_set1_epi32(0xFFDEFFD3); const __m128i c16_p36_p02 = _mm_set1_epi32(0x00240002); const __m128i c16_p19_p44 = _mm_set1_epi32(0x0013002C); const __m128i c16_n02_p39 = _mm_set1_epi32(0xFFFE0027); const __m128i c16_n36_n41 = _mm_set1_epi32(0xFFDCFFD7); const __m128i c16_p43_p07 = _mm_set1_epi32(0x002B0007); const __m128i c16_n11_p34 = _mm_set1_epi32(0xFFF50022); const __m128i c16_n30_n44 = _mm_set1_epi32(0xFFE2FFD4); const __m128i c16_p45_p15 = _mm_set1_epi32(0x002D000F); const __m128i c16_n19_p27 = _mm_set1_epi32(0xFFED001B); const __m128i c16_n23_n45 = _mm_set1_epi32(0xFFE9FFD3); const __m128i c16_n15_p36 = _mm_set1_epi32(0xFFF10024); const __m128i c16_n11_n45 = _mm_set1_epi32(0xFFF5FFD3); const __m128i c16_p34_p39 = _mm_set1_epi32(0x00220027); const __m128i c16_n45_n19 = _mm_set1_epi32(0xFFD3FFED); const __m128i c16_p41_n07 = _mm_set1_epi32(0x0029FFF9); const __m128i c16_n23_p30 = _mm_set1_epi32(0xFFE9001E); const __m128i c16_n02_n44 = _mm_set1_epi32(0xFFFEFFD4); const __m128i c16_p27_p43 = _mm_set1_epi32(0x001B002B); const __m128i c16_n27_p34 = _mm_set1_epi32(0xFFE50022); const __m128i c16_p19_n39 = _mm_set1_epi32(0x0013FFD9); const __m128i c16_n11_p43 = _mm_set1_epi32(0xFFF5002B); const __m128i c16_p02_n45 = _mm_set1_epi32(0x0002FFD3); const __m128i c16_p07_p45 = _mm_set1_epi32(0x0007002D); const __m128i c16_n15_n44 = _mm_set1_epi32(0xFFF1FFD4); const __m128i c16_p23_p41 = _mm_set1_epi32(0x00170029); const __m128i c16_n30_n36 = _mm_set1_epi32(0xFFE2FFDC); const __m128i c16_n36_p30 = _mm_set1_epi32(0xFFDC001E); const __m128i c16_p41_n23 = _mm_set1_epi32(0x0029FFE9); const __m128i c16_n44_p15 = _mm_set1_epi32(0xFFD4000F); const __m128i c16_p45_n07 = _mm_set1_epi32(0x002DFFF9); const __m128i c16_n45_n02 = _mm_set1_epi32(0xFFD3FFFE); const __m128i c16_p43_p11 = _mm_set1_epi32(0x002B000B); const __m128i c16_n39_n19 = _mm_set1_epi32(0xFFD9FFED); const __m128i c16_p34_p27 = _mm_set1_epi32(0x0022001B); const __m128i c16_n43_p27 = _mm_set1_epi32(0xFFD5001B); const __m128i c16_p44_n02 = 
_mm_set1_epi32(0x002CFFFE); const __m128i c16_n30_n23 = _mm_set1_epi32(0xFFE2FFE9); const __m128i c16_p07_p41 = _mm_set1_epi32(0x00070029); const __m128i c16_p19_n45 = _mm_set1_epi32(0x0013FFD3); const __m128i c16_n39_p34 = _mm_set1_epi32(0xFFD90022); const __m128i c16_p45_n11 = _mm_set1_epi32(0x002DFFF5); const __m128i c16_n36_n15 = _mm_set1_epi32(0xFFDCFFF1); const __m128i c16_n45_p23 = _mm_set1_epi32(0xFFD30017); const __m128i c16_p27_p19 = _mm_set1_epi32(0x001B0013); const __m128i c16_p15_n45 = _mm_set1_epi32(0x000FFFD3); const __m128i c16_n44_p30 = _mm_set1_epi32(0xFFD4001E); const __m128i c16_p34_p11 = _mm_set1_epi32(0x0022000B); const __m128i c16_p07_n43 = _mm_set1_epi32(0x0007FFD5); const __m128i c16_n41_p36 = _mm_set1_epi32(0xFFD70024); const __m128i c16_p39_p02 = _mm_set1_epi32(0x00270002); const __m128i c16_n44_p19 = _mm_set1_epi32(0xFFD40013); const __m128i c16_n02_p36 = _mm_set1_epi32(0xFFFE0024); const __m128i c16_p45_n34 = _mm_set1_epi32(0x002DFFDE); const __m128i c16_n15_n23 = _mm_set1_epi32(0xFFF1FFE9); const __m128i c16_n39_p43 = _mm_set1_epi32(0xFFD9002B); const __m128i c16_p30_p07 = _mm_set1_epi32(0x001E0007); const __m128i c16_p27_n45 = _mm_set1_epi32(0x001BFFD3); const __m128i c16_n41_p11 = _mm_set1_epi32(0xFFD7000B); const __m128i c16_n39_p15 = _mm_set1_epi32(0xFFD9000F); const __m128i c16_n30_p45 = _mm_set1_epi32(0xFFE2002D); const __m128i c16_p27_p02 = _mm_set1_epi32(0x001B0002); const __m128i c16_p41_n44 = _mm_set1_epi32(0x0029FFD4); const __m128i c16_n11_n19 = _mm_set1_epi32(0xFFF5FFED); const __m128i c16_n45_p36 = _mm_set1_epi32(0xFFD30024); const __m128i c16_n07_p34 = _mm_set1_epi32(0xFFF90022); const __m128i c16_p43_n23 = _mm_set1_epi32(0x002BFFE9); const __m128i c16_n30_p11 = _mm_set1_epi32(0xFFE2000B); const __m128i c16_n45_p43 = _mm_set1_epi32(0xFFD3002B); const __m128i c16_n19_p36 = _mm_set1_epi32(0xFFED0024); const __m128i c16_p23_n02 = _mm_set1_epi32(0x0017FFFE); const __m128i c16_p45_n39 = _mm_set1_epi32(0x002DFFD9); const __m128i c16_p27_n41 = _mm_set1_epi32(0x001BFFD7); const __m128i c16_n15_n07 = _mm_set1_epi32(0xFFF1FFF9); const __m128i c16_n44_p34 = _mm_set1_epi32(0xFFD40022); const __m128i c16_n19_p07 = _mm_set1_epi32(0xFFED0007); const __m128i c16_n39_p30 = _mm_set1_epi32(0xFFD9001E); const __m128i c16_n45_p44 = _mm_set1_epi32(0xFFD3002C); const __m128i c16_n36_p43 = _mm_set1_epi32(0xFFDC002B); const __m128i c16_n15_p27 = _mm_set1_epi32(0xFFF1001B); const __m128i c16_p11_p02 = _mm_set1_epi32(0x000B0002); const __m128i c16_p34_n23 = _mm_set1_epi32(0x0022FFE9); const __m128i c16_p45_n41 = _mm_set1_epi32(0x002DFFD7); const __m128i c16_n07_p02 = _mm_set1_epi32(0xFFF90002); const __m128i c16_n15_p11 = _mm_set1_epi32(0xFFF1000B); const __m128i c16_n23_p19 = _mm_set1_epi32(0xFFE90013); const __m128i c16_n30_p27 = _mm_set1_epi32(0xFFE2001B); const __m128i c16_n36_p34 = _mm_set1_epi32(0xFFDC0022); const __m128i c16_n41_p39 = _mm_set1_epi32(0xFFD70027); const __m128i c16_n44_p43 = _mm_set1_epi32(0xFFD4002B); const __m128i c16_n45_p45 = _mm_set1_epi32(0xFFD3002D); // const __m128i c16_p43_p45 = _mm_set1_epi32(0x002B002D); const __m128i c16_p35_p40 = _mm_set1_epi32(0x00230028); const __m128i c16_p21_p29 = _mm_set1_epi32(0x0015001D); const __m128i c16_p04_p13 = _mm_set1_epi32(0x0004000D); const __m128i c16_p29_p43 = _mm_set1_epi32(0x001D002B); const __m128i c16_n21_p04 = _mm_set1_epi32(0xFFEB0004); const __m128i c16_n45_n40 = _mm_set1_epi32(0xFFD3FFD8); const __m128i c16_n13_n35 = _mm_set1_epi32(0xFFF3FFDD); const __m128i c16_p04_p40 = 
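/* The constant banks from the {45,43,40,35,29,21,13,04} pairs through the
 * +/-32 pair below serve the even half of the 32-point butterfly:
 * the 16-point EO stage, the 8-point EEO stage ({44,38,25,09}), the EEEO
 * pair ({42,17}) and the final 2-point EEEE pair ({32,32}). */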
_mm_set1_epi32(0x00040028); const __m128i c16_n43_n35 = _mm_set1_epi32(0xFFD5FFDD); const __m128i c16_p29_n13 = _mm_set1_epi32(0x001DFFF3); const __m128i c16_p21_p45 = _mm_set1_epi32(0x0015002D); const __m128i c16_n21_p35 = _mm_set1_epi32(0xFFEB0023); const __m128i c16_p04_n43 = _mm_set1_epi32(0x0004FFD5); const __m128i c16_p13_p45 = _mm_set1_epi32(0x000D002D); const __m128i c16_n29_n40 = _mm_set1_epi32(0xFFE3FFD8); const __m128i c16_n40_p29 = _mm_set1_epi32(0xFFD8001D); const __m128i c16_p45_n13 = _mm_set1_epi32(0x002DFFF3); const __m128i c16_n43_n04 = _mm_set1_epi32(0xFFD5FFFC); const __m128i c16_p35_p21 = _mm_set1_epi32(0x00230015); const __m128i c16_n45_p21 = _mm_set1_epi32(0xFFD30015); const __m128i c16_p13_p29 = _mm_set1_epi32(0x000D001D); const __m128i c16_p35_n43 = _mm_set1_epi32(0x0023FFD5); const __m128i c16_n40_p04 = _mm_set1_epi32(0xFFD80004); const __m128i c16_n35_p13 = _mm_set1_epi32(0xFFDD000D); const __m128i c16_n40_p45 = _mm_set1_epi32(0xFFD8002D); const __m128i c16_p04_p21 = _mm_set1_epi32(0x00040015); const __m128i c16_p43_n29 = _mm_set1_epi32(0x002BFFE3); const __m128i c16_n13_p04 = _mm_set1_epi32(0xFFF30004); const __m128i c16_n29_p21 = _mm_set1_epi32(0xFFE30015); const __m128i c16_n40_p35 = _mm_set1_epi32(0xFFD80023); // const __m128i c16_n45_p43 = _mm_set1_epi32(0xFFD3002B); const __m128i c16_p38_p44 = _mm_set1_epi32(0x0026002C); const __m128i c16_p09_p25 = _mm_set1_epi32(0x00090019); const __m128i c16_n09_p38 = _mm_set1_epi32(0xFFF70026); const __m128i c16_n25_n44 = _mm_set1_epi32(0xFFE7FFD4); const __m128i c16_n44_p25 = _mm_set1_epi32(0xFFD40019); const __m128i c16_p38_p09 = _mm_set1_epi32(0x00260009); const __m128i c16_n25_p09 = _mm_set1_epi32(0xFFE70009); const __m128i c16_n44_p38 = _mm_set1_epi32(0xFFD40026); const __m128i c16_p17_p42 = _mm_set1_epi32(0x0011002A); const __m128i c16_n42_p17 = _mm_set1_epi32(0xFFD60011); const __m128i c16_p32_p32 = _mm_set1_epi32(0x00200020); const __m128i c16_n32_p32 = _mm_set1_epi32(0xFFE00020); __m128i c32_rnd = _mm_set1_epi32(16); // add1 int nShift = 5; int i, pass, part; // DCT1 __m128i in00[4], in01[4], in02[4], in03[4], in04[4], in05[4], in06[4], in07[4], in08[4], in09[4], in10[4], in11[4], in12[4], in13[4], in14[4], in15[4]; __m128i in16[4], in17[4], in18[4], in19[4], in20[4], in21[4], in22[4], in23[4], in24[4], in25[4], in26[4], in27[4], in28[4], in29[4], in30[4], in31[4]; __m128i res00[4], res01[4], res02[4], res03[4], res04[4], res05[4], res06[4], res07[4], res08[4], res09[4], res10[4], res11[4], res12[4], res13[4], res14[4], res15[4]; __m128i res16[4], res17[4], res18[4], res19[4], res20[4], res21[4], res22[4], res23[4], res24[4], res25[4], res26[4], res27[4], res28[4], res29[4], res30[4], res31[4]; i_dst &= 0xFE; /* remember to remove the flag bit */ for (i = 0; i < 4; i++) { const int offset = (i << 3); in00[i] = _mm_loadu_si128((const __m128i*)&src[ 0 * 32 + offset]); in01[i] = _mm_loadu_si128((const __m128i*)&src[ 1 * 32 + offset]); in02[i] = _mm_loadu_si128((const __m128i*)&src[ 2 * 32 + offset]); in03[i] = _mm_loadu_si128((const __m128i*)&src[ 3 * 32 + offset]); in04[i] = _mm_loadu_si128((const __m128i*)&src[ 4 * 32 + offset]); in05[i] = _mm_loadu_si128((const __m128i*)&src[ 5 * 32 + offset]); in06[i] = _mm_loadu_si128((const __m128i*)&src[ 6 * 32 + offset]); in07[i] = _mm_loadu_si128((const __m128i*)&src[ 7 * 32 + offset]); in08[i] = _mm_loadu_si128((const __m128i*)&src[ 8 * 32 + offset]); in09[i] = _mm_loadu_si128((const __m128i*)&src[ 9 * 32 + offset]); in10[i] = _mm_loadu_si128((const __m128i*)&src[10 * 32 + 
offset]); in11[i] = _mm_loadu_si128((const __m128i*)&src[11 * 32 + offset]); in12[i] = _mm_loadu_si128((const __m128i*)&src[12 * 32 + offset]); in13[i] = _mm_loadu_si128((const __m128i*)&src[13 * 32 + offset]); in14[i] = _mm_loadu_si128((const __m128i*)&src[14 * 32 + offset]); in15[i] = _mm_loadu_si128((const __m128i*)&src[15 * 32 + offset]); in16[i] = _mm_loadu_si128((const __m128i*)&src[16 * 32 + offset]); in17[i] = _mm_loadu_si128((const __m128i*)&src[17 * 32 + offset]); in18[i] = _mm_loadu_si128((const __m128i*)&src[18 * 32 + offset]); in19[i] = _mm_loadu_si128((const __m128i*)&src[19 * 32 + offset]); in20[i] = _mm_loadu_si128((const __m128i*)&src[20 * 32 + offset]); in21[i] = _mm_loadu_si128((const __m128i*)&src[21 * 32 + offset]); in22[i] = _mm_loadu_si128((const __m128i*)&src[22 * 32 + offset]); in23[i] = _mm_loadu_si128((const __m128i*)&src[23 * 32 + offset]); in24[i] = _mm_loadu_si128((const __m128i*)&src[24 * 32 + offset]); in25[i] = _mm_loadu_si128((const __m128i*)&src[25 * 32 + offset]); in26[i] = _mm_loadu_si128((const __m128i*)&src[26 * 32 + offset]); in27[i] = _mm_loadu_si128((const __m128i*)&src[27 * 32 + offset]); in28[i] = _mm_loadu_si128((const __m128i*)&src[28 * 32 + offset]); in29[i] = _mm_loadu_si128((const __m128i*)&src[29 * 32 + offset]); in30[i] = _mm_loadu_si128((const __m128i*)&src[30 * 32 + offset]); in31[i] = _mm_loadu_si128((const __m128i*)&src[31 * 32 + offset]); } for (pass = 0; pass < 2; pass++) { if (pass == 1) { c32_rnd = _mm_set1_epi32(1 << (shift2 - 1)); // add2 nShift = shift2; } for (part = 0; part < 4; part++) { const __m128i T_00_00A = _mm_unpacklo_epi16(in01[part], in03[part]); // [33 13 32 12 31 11 30 10] const __m128i T_00_00B = _mm_unpackhi_epi16(in01[part], in03[part]); // [37 17 36 16 35 15 34 14] const __m128i T_00_01A = _mm_unpacklo_epi16(in05[part], in07[part]); // [ ] const __m128i T_00_01B = _mm_unpackhi_epi16(in05[part], in07[part]); // [ ] const __m128i T_00_02A = _mm_unpacklo_epi16(in09[part], in11[part]); // [ ] const __m128i T_00_02B = _mm_unpackhi_epi16(in09[part], in11[part]); // [ ] const __m128i T_00_03A = _mm_unpacklo_epi16(in13[part], in15[part]); // [ ] const __m128i T_00_03B = _mm_unpackhi_epi16(in13[part], in15[part]); // [ ] const __m128i T_00_04A = _mm_unpacklo_epi16(in17[part], in19[part]); // [ ] const __m128i T_00_04B = _mm_unpackhi_epi16(in17[part], in19[part]); // [ ] const __m128i T_00_05A = _mm_unpacklo_epi16(in21[part], in23[part]); // [ ] const __m128i T_00_05B = _mm_unpackhi_epi16(in21[part], in23[part]); // [ ] const __m128i T_00_06A = _mm_unpacklo_epi16(in25[part], in27[part]); // [ ] const __m128i T_00_06B = _mm_unpackhi_epi16(in25[part], in27[part]); // [ ] const __m128i T_00_07A = _mm_unpacklo_epi16(in29[part], in31[part]); // const __m128i T_00_07B = _mm_unpackhi_epi16(in29[part], in31[part]); // [ ] const __m128i T_00_08A = _mm_unpacklo_epi16(in02[part], in06[part]); // [ ] const __m128i T_00_08B = _mm_unpackhi_epi16(in02[part], in06[part]); // [ ] const __m128i T_00_09A = _mm_unpacklo_epi16(in10[part], in14[part]); // [ ] const __m128i T_00_09B = _mm_unpackhi_epi16(in10[part], in14[part]); // [ ] const __m128i T_00_10A = _mm_unpacklo_epi16(in18[part], in22[part]); // [ ] const __m128i T_00_10B = _mm_unpackhi_epi16(in18[part], in22[part]); // [ ] const __m128i T_00_11A = _mm_unpacklo_epi16(in26[part], in30[part]); // [ ] const __m128i T_00_11B = _mm_unpackhi_epi16(in26[part], in30[part]); // [ ] const __m128i T_00_12A = _mm_unpacklo_epi16(in04[part], in12[part]); // [ ] const __m128i T_00_12B = 
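/* Input staging: T_00_00..07 interleave the odd rows (1,3)(5,7)...(29,31)
 * for the O terms, T_00_08..11 interleave rows (2,6)(10,14)(18,22)(26,30)
 * for the EO terms, T_00_12..13 rows (4,12)(20,28) for EEO, T_00_14 rows
 * (8,24) for EEEO, and T_00_15 rows (0,16) for EEEE. */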
_mm_unpackhi_epi16(in04[part], in12[part]); // [ ] const __m128i T_00_13A = _mm_unpacklo_epi16(in20[part], in28[part]); // [ ] const __m128i T_00_13B = _mm_unpackhi_epi16(in20[part], in28[part]); // [ ] const __m128i T_00_14A = _mm_unpacklo_epi16(in08[part], in24[part]); // const __m128i T_00_14B = _mm_unpackhi_epi16(in08[part], in24[part]); // [ ] const __m128i T_00_15A = _mm_unpacklo_epi16(in00[part], in16[part]); // const __m128i T_00_15B = _mm_unpackhi_epi16(in00[part], in16[part]); // [ ] __m128i O00A, O01A, O02A, O03A, O04A, O05A, O06A, O07A, O08A, O09A, O10A, O11A, O12A, O13A, O14A, O15A; __m128i O00B, O01B, O02B, O03B, O04B, O05B, O06B, O07B, O08B, O09B, O10B, O11B, O12B, O13B, O14B, O15B; __m128i EO0A, EO1A, EO2A, EO3A, EO4A, EO5A, EO6A, EO7A; __m128i EO0B, EO1B, EO2B, EO3B, EO4B, EO5B, EO6B, EO7B; { __m128i T00, T01, T02, T03; #define COMPUTE_ROW(r0103, r0507, r0911, r1315, r1719, r2123, r2527, r2931, c0103, c0507, c0911, c1315, c1719, c2123, c2527, c2931, row) \ T00 = _mm_add_epi32(_mm_madd_epi16(r0103, c0103), _mm_madd_epi16(r0507, c0507)); \ T01 = _mm_add_epi32(_mm_madd_epi16(r0911, c0911), _mm_madd_epi16(r1315, c1315)); \ T02 = _mm_add_epi32(_mm_madd_epi16(r1719, c1719), _mm_madd_epi16(r2123, c2123)); \ T03 = _mm_add_epi32(_mm_madd_epi16(r2527, c2527), _mm_madd_epi16(r2931, c2931)); \ row = _mm_add_epi32(_mm_add_epi32(T00, T01), _mm_add_epi32(T02, T03)); COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_p45_p45, c16_p43_p44, c16_p39_p41, c16_p34_p36, c16_p27_p30, c16_p19_p23, c16_p11_p15, c16_p02_p07, O00A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_p41_p45, c16_p23_p34, c16_n02_p11, c16_n27_n15, c16_n43_n36, c16_n44_n45, c16_n30_n39, c16_n07_n19, O01A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_p34_p44, c16_n07_p15, c16_n41_n27, c16_n39_n45, c16_n02_n23, c16_p36_p19, c16_p43_p45, c16_p11_p30, O02A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_p23_p43, c16_n34_n07, c16_n36_n45, c16_p19_n11, c16_p44_p41, c16_n02_p27, c16_n45_n30, c16_n15_n39, O03A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_p11_p41, c16_n45_n27, c16_p07_n30, c16_p43_p39, c16_n23_p15, c16_n34_n45, c16_p36_p02, c16_p19_p44, O04A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n02_p39, c16_n36_n41, c16_p43_p07, c16_n11_p34, c16_n30_n44, c16_p45_p15, c16_n19_p27, c16_n23_n45, O05A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n15_p36, c16_n11_n45, c16_p34_p39, c16_n45_n19, c16_p41_n07, c16_n23_p30, c16_n02_n44, c16_p27_p43, O06A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n27_p34, c16_p19_n39, c16_n11_p43, c16_p02_n45, c16_p07_p45, c16_n15_n44, c16_p23_p41, c16_n30_n36, O07A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n36_p30, c16_p41_n23, c16_n44_p15, c16_p45_n07, c16_n45_n02, c16_p43_p11, c16_n39_n19, c16_p34_p27, O08A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n43_p27, c16_p44_n02, c16_n30_n23, c16_p07_p41, c16_p19_n45, c16_n39_p34, c16_p45_n11, c16_n36_n15, O09A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n45_p23, c16_p27_p19, 
c16_p15_n45, c16_n44_p30, c16_p34_p11, c16_p07_n43, c16_n41_p36, c16_p39_p02, O10A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n44_p19, c16_n02_p36, c16_p45_n34, c16_n15_n23, c16_n39_p43, c16_p30_p07, c16_p27_n45, c16_n41_p11, O11A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n39_p15, c16_n30_p45, c16_p27_p02, c16_p41_n44, c16_n11_n19, c16_n45_p36, c16_n07_p34, c16_p43_n23, O12A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n30_p11, c16_n45_p43, c16_n19_p36, c16_p23_n02, c16_p45_n39, c16_p27_n41, c16_n15_n07, c16_n44_p34, O13A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n19_p07, c16_n39_p30, c16_n45_p44, c16_n36_p43, c16_n15_p27, c16_p11_p02, c16_p34_n23, c16_p45_n41, O14A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n07_p02, c16_n15_p11, c16_n23_p19, c16_n30_p27, c16_n36_p34, c16_n41_p39, c16_n44_p43, c16_n45_p45, O15A) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_p45_p45, c16_p43_p44, c16_p39_p41, c16_p34_p36, c16_p27_p30, c16_p19_p23, c16_p11_p15, c16_p02_p07, O00B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_p41_p45, c16_p23_p34, c16_n02_p11, c16_n27_n15, c16_n43_n36, c16_n44_n45, c16_n30_n39, c16_n07_n19, O01B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_p34_p44, c16_n07_p15, c16_n41_n27, c16_n39_n45, c16_n02_n23, c16_p36_p19, c16_p43_p45, c16_p11_p30, O02B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_p23_p43, c16_n34_n07, c16_n36_n45, c16_p19_n11, c16_p44_p41, c16_n02_p27, c16_n45_n30, c16_n15_n39, O03B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_p11_p41, c16_n45_n27, c16_p07_n30, c16_p43_p39, c16_n23_p15, c16_n34_n45, c16_p36_p02, c16_p19_p44, O04B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n02_p39, c16_n36_n41, c16_p43_p07, c16_n11_p34, c16_n30_n44, c16_p45_p15, c16_n19_p27, c16_n23_n45, O05B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n15_p36, c16_n11_n45, c16_p34_p39, c16_n45_n19, c16_p41_n07, c16_n23_p30, c16_n02_n44, c16_p27_p43, O06B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n27_p34, c16_p19_n39, c16_n11_p43, c16_p02_n45, c16_p07_p45, c16_n15_n44, c16_p23_p41, c16_n30_n36, O07B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n36_p30, c16_p41_n23, c16_n44_p15, c16_p45_n07, c16_n45_n02, c16_p43_p11, c16_n39_n19, c16_p34_p27, O08B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n43_p27, c16_p44_n02, c16_n30_n23, c16_p07_p41, c16_p19_n45, c16_n39_p34, c16_p45_n11, c16_n36_n15, O09B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n45_p23, c16_p27_p19, c16_p15_n45, c16_n44_p30, c16_p34_p11, c16_p07_n43, c16_n41_p36, c16_p39_p02, O10B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n44_p19, c16_n02_p36, c16_p45_n34, c16_n15_n23, c16_n39_p43, c16_p30_p07, c16_p27_n45, c16_n41_p11, O11B) 
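/* Each COMPUTE_ROW above folds eight _mm_madd_epi16 products into one
 * 32-bit odd-part term per lane; O00..O15 are produced twice per part,
 * once for the low (A) and once for the high (B) four lanes of the current
 * 8-column group. */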
COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n39_p15, c16_n30_p45, c16_p27_p02, c16_p41_n44, c16_n11_n19, c16_n45_p36, c16_n07_p34, c16_p43_n23, O12B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n30_p11, c16_n45_p43, c16_n19_p36, c16_p23_n02, c16_p45_n39, c16_p27_n41, c16_n15_n07, c16_n44_p34, O13B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n19_p07, c16_n39_p30, c16_n45_p44, c16_n36_p43, c16_n15_p27, c16_p11_p02, c16_p34_n23, c16_p45_n41, O14B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n07_p02, c16_n15_p11, c16_n23_p19, c16_n30_p27, c16_n36_p34, c16_n41_p39, c16_n44_p43, c16_n45_p45, O15B) #undef COMPUTE_ROW } { __m128i T00, T01; #define COMPUTE_ROW(row0206, row1014, row1822, row2630, c0206, c1014, c1822, c2630, row) \ T00 = _mm_add_epi32(_mm_madd_epi16(row0206, c0206), _mm_madd_epi16(row1014, c1014)); \ T01 = _mm_add_epi32(_mm_madd_epi16(row1822, c1822), _mm_madd_epi16(row2630, c2630)); \ row = _mm_add_epi32(T00, T01); COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_p43_p45, c16_p35_p40, c16_p21_p29, c16_p04_p13, EO0A) COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_p29_p43, c16_n21_p04, c16_n45_n40, c16_n13_n35, EO1A) COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_p04_p40, c16_n43_n35, c16_p29_n13, c16_p21_p45, EO2A) COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_n21_p35, c16_p04_n43, c16_p13_p45, c16_n29_n40, EO3A) COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_n40_p29, c16_p45_n13, c16_n43_n04, c16_p35_p21, EO4A) COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_n45_p21, c16_p13_p29, c16_p35_n43, c16_n40_p04, EO5A) COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_n35_p13, c16_n40_p45, c16_p04_p21, c16_p43_n29, EO6A) COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_n13_p04, c16_n29_p21, c16_n40_p35, c16_n45_p43, EO7A) COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_p43_p45, c16_p35_p40, c16_p21_p29, c16_p04_p13, EO0B) COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_p29_p43, c16_n21_p04, c16_n45_n40, c16_n13_n35, EO1B) COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_p04_p40, c16_n43_n35, c16_p29_n13, c16_p21_p45, EO2B) COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_n21_p35, c16_p04_n43, c16_p13_p45, c16_n29_n40, EO3B) COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_n40_p29, c16_p45_n13, c16_n43_n04, c16_p35_p21, EO4B) COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_n45_p21, c16_p13_p29, c16_p35_n43, c16_n40_p04, EO5B) COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_n35_p13, c16_n40_p45, c16_p04_p21, c16_p43_n29, EO6B) COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_n13_p04, c16_n29_p21, c16_n40_p35, c16_n45_p43, EO7B) #undef COMPUTE_ROW } { const __m128i EEO0A = _mm_add_epi32(_mm_madd_epi16(T_00_12A, c16_p38_p44), _mm_madd_epi16(T_00_13A, c16_p09_p25)); const __m128i EEO1A = _mm_add_epi32(_mm_madd_epi16(T_00_12A, c16_n09_p38), _mm_madd_epi16(T_00_13A, c16_n25_n44)); const __m128i EEO2A = _mm_add_epi32(_mm_madd_epi16(T_00_12A, c16_n44_p25), _mm_madd_epi16(T_00_13A, c16_p38_p09)); const __m128i EEO3A = _mm_add_epi32(_mm_madd_epi16(T_00_12A, c16_n25_p09), _mm_madd_epi16(T_00_13A, c16_n44_p38)); const __m128i EEO0B = _mm_add_epi32(_mm_madd_epi16(T_00_12B, c16_p38_p44), _mm_madd_epi16(T_00_13B, c16_p09_p25)); const __m128i EEO1B = 
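/* Even-part butterfly: EEEE0/1 and EEEO0/1 fold into EEE0..EEE3, those
 * combine with EEO0..EEO3 into EE0..EE7, and EE +/- EO finally yields the
 * sixteen even terms E0..E7 and E8..EF consumed by the output stage below. */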
_mm_add_epi32(_mm_madd_epi16(T_00_12B, c16_n09_p38), _mm_madd_epi16(T_00_13B, c16_n25_n44)); const __m128i EEO2B = _mm_add_epi32(_mm_madd_epi16(T_00_12B, c16_n44_p25), _mm_madd_epi16(T_00_13B, c16_p38_p09)); const __m128i EEO3B = _mm_add_epi32(_mm_madd_epi16(T_00_12B, c16_n25_p09), _mm_madd_epi16(T_00_13B, c16_n44_p38)); const __m128i EEEO0A = _mm_madd_epi16(T_00_14A, c16_p17_p42); const __m128i EEEO0B = _mm_madd_epi16(T_00_14B, c16_p17_p42); const __m128i EEEO1A = _mm_madd_epi16(T_00_14A, c16_n42_p17); const __m128i EEEO1B = _mm_madd_epi16(T_00_14B, c16_n42_p17); const __m128i EEEE0A = _mm_madd_epi16(T_00_15A, c16_p32_p32); const __m128i EEEE0B = _mm_madd_epi16(T_00_15B, c16_p32_p32); const __m128i EEEE1A = _mm_madd_epi16(T_00_15A, c16_n32_p32); const __m128i EEEE1B = _mm_madd_epi16(T_00_15B, c16_n32_p32); const __m128i EEE0A = _mm_add_epi32(EEEE0A, EEEO0A); // EEE0 = EEEE0 + EEEO0 const __m128i EEE0B = _mm_add_epi32(EEEE0B, EEEO0B); const __m128i EEE1A = _mm_add_epi32(EEEE1A, EEEO1A); // EEE1 = EEEE1 + EEEO1 const __m128i EEE1B = _mm_add_epi32(EEEE1B, EEEO1B); const __m128i EEE3A = _mm_sub_epi32(EEEE0A, EEEO0A); // EEE2 = EEEE0 - EEEO0 const __m128i EEE3B = _mm_sub_epi32(EEEE0B, EEEO0B); const __m128i EEE2A = _mm_sub_epi32(EEEE1A, EEEO1A); // EEE3 = EEEE1 - EEEO1 const __m128i EEE2B = _mm_sub_epi32(EEEE1B, EEEO1B); const __m128i EE0A = _mm_add_epi32(EEE0A, EEO0A); // EE0 = EEE0 + EEO0 const __m128i EE0B = _mm_add_epi32(EEE0B, EEO0B); const __m128i EE1A = _mm_add_epi32(EEE1A, EEO1A); // EE1 = EEE1 + EEO1 const __m128i EE1B = _mm_add_epi32(EEE1B, EEO1B); const __m128i EE2A = _mm_add_epi32(EEE2A, EEO2A); // EE2 = EEE0 + EEO0 const __m128i EE2B = _mm_add_epi32(EEE2B, EEO2B); const __m128i EE3A = _mm_add_epi32(EEE3A, EEO3A); // EE3 = EEE1 + EEO1 const __m128i EE3B = _mm_add_epi32(EEE3B, EEO3B); const __m128i EE7A = _mm_sub_epi32(EEE0A, EEO0A); // EE7 = EEE0 - EEO0 const __m128i EE7B = _mm_sub_epi32(EEE0B, EEO0B); const __m128i EE6A = _mm_sub_epi32(EEE1A, EEO1A); // EE6 = EEE1 - EEO1 const __m128i EE6B = _mm_sub_epi32(EEE1B, EEO1B); const __m128i EE5A = _mm_sub_epi32(EEE2A, EEO2A); // EE5 = EEE0 - EEO0 const __m128i EE5B = _mm_sub_epi32(EEE2B, EEO2B); const __m128i EE4A = _mm_sub_epi32(EEE3A, EEO3A); // EE4 = EEE1 - EEO1 const __m128i EE4B = _mm_sub_epi32(EEE3B, EEO3B); const __m128i E0A = _mm_add_epi32(EE0A, EO0A); // E0 = EE0 + EO0 const __m128i E0B = _mm_add_epi32(EE0B, EO0B); const __m128i E1A = _mm_add_epi32(EE1A, EO1A); // E1 = EE1 + EO1 const __m128i E1B = _mm_add_epi32(EE1B, EO1B); const __m128i E2A = _mm_add_epi32(EE2A, EO2A); // E2 = EE2 + EO2 const __m128i E2B = _mm_add_epi32(EE2B, EO2B); const __m128i E3A = _mm_add_epi32(EE3A, EO3A); // E3 = EE3 + EO3 const __m128i E3B = _mm_add_epi32(EE3B, EO3B); const __m128i E4A = _mm_add_epi32(EE4A, EO4A); // E4 = const __m128i E4B = _mm_add_epi32(EE4B, EO4B); const __m128i E5A = _mm_add_epi32(EE5A, EO5A); // E5 = const __m128i E5B = _mm_add_epi32(EE5B, EO5B); const __m128i E6A = _mm_add_epi32(EE6A, EO6A); // E6 = const __m128i E6B = _mm_add_epi32(EE6B, EO6B); const __m128i E7A = _mm_add_epi32(EE7A, EO7A); // E7 = const __m128i E7B = _mm_add_epi32(EE7B, EO7B); const __m128i EFA = _mm_sub_epi32(EE0A, EO0A); // EF = EE0 - EO0 const __m128i EFB = _mm_sub_epi32(EE0B, EO0B); const __m128i EEA = _mm_sub_epi32(EE1A, EO1A); // EE = EE1 - EO1 const __m128i EEB = _mm_sub_epi32(EE1B, EO1B); const __m128i EDA = _mm_sub_epi32(EE2A, EO2A); // ED = EE2 - EO2 const __m128i EDB = _mm_sub_epi32(EE2B, EO2B); const __m128i ECA = _mm_sub_epi32(EE3A, EO3A); // EC = 
EE3 - EO3 const __m128i ECB = _mm_sub_epi32(EE3B, EO3B); const __m128i EBA = _mm_sub_epi32(EE4A, EO4A); // EB = const __m128i EBB = _mm_sub_epi32(EE4B, EO4B); const __m128i EAA = _mm_sub_epi32(EE5A, EO5A); // EA = const __m128i EAB = _mm_sub_epi32(EE5B, EO5B); const __m128i E9A = _mm_sub_epi32(EE6A, EO6A); // E9 = const __m128i E9B = _mm_sub_epi32(EE6B, EO6B); const __m128i E8A = _mm_sub_epi32(EE7A, EO7A); // E8 = const __m128i E8B = _mm_sub_epi32(EE7B, EO7B); const __m128i T10A = _mm_add_epi32(E0A, c32_rnd); // E0 + rnd const __m128i T10B = _mm_add_epi32(E0B, c32_rnd); const __m128i T11A = _mm_add_epi32(E1A, c32_rnd); // E1 + rnd const __m128i T11B = _mm_add_epi32(E1B, c32_rnd); const __m128i T12A = _mm_add_epi32(E2A, c32_rnd); // E2 + rnd const __m128i T12B = _mm_add_epi32(E2B, c32_rnd); const __m128i T13A = _mm_add_epi32(E3A, c32_rnd); // E3 + rnd const __m128i T13B = _mm_add_epi32(E3B, c32_rnd); const __m128i T14A = _mm_add_epi32(E4A, c32_rnd); // E4 + rnd const __m128i T14B = _mm_add_epi32(E4B, c32_rnd); const __m128i T15A = _mm_add_epi32(E5A, c32_rnd); // E5 + rnd const __m128i T15B = _mm_add_epi32(E5B, c32_rnd); const __m128i T16A = _mm_add_epi32(E6A, c32_rnd); // E6 + rnd const __m128i T16B = _mm_add_epi32(E6B, c32_rnd); const __m128i T17A = _mm_add_epi32(E7A, c32_rnd); // E7 + rnd const __m128i T17B = _mm_add_epi32(E7B, c32_rnd); const __m128i T18A = _mm_add_epi32(E8A, c32_rnd); // E8 + rnd const __m128i T18B = _mm_add_epi32(E8B, c32_rnd); const __m128i T19A = _mm_add_epi32(E9A, c32_rnd); // E9 + rnd const __m128i T19B = _mm_add_epi32(E9B, c32_rnd); const __m128i T1AA = _mm_add_epi32(EAA, c32_rnd); // E10 + rnd const __m128i T1AB = _mm_add_epi32(EAB, c32_rnd); const __m128i T1BA = _mm_add_epi32(EBA, c32_rnd); // E11 + rnd const __m128i T1BB = _mm_add_epi32(EBB, c32_rnd); const __m128i T1CA = _mm_add_epi32(ECA, c32_rnd); // E12 + rnd const __m128i T1CB = _mm_add_epi32(ECB, c32_rnd); const __m128i T1DA = _mm_add_epi32(EDA, c32_rnd); // E13 + rnd const __m128i T1DB = _mm_add_epi32(EDB, c32_rnd); const __m128i T1EA = _mm_add_epi32(EEA, c32_rnd); // E14 + rnd const __m128i T1EB = _mm_add_epi32(EEB, c32_rnd); const __m128i T1FA = _mm_add_epi32(EFA, c32_rnd); // E15 + rnd const __m128i T1FB = _mm_add_epi32(EFB, c32_rnd); const __m128i T2_00A = _mm_add_epi32(T10A, O00A); // E0 + O0 + rnd const __m128i T2_00B = _mm_add_epi32(T10B, O00B); const __m128i T2_01A = _mm_add_epi32(T11A, O01A); // E1 + O1 + rnd const __m128i T2_01B = _mm_add_epi32(T11B, O01B); const __m128i T2_02A = _mm_add_epi32(T12A, O02A); // E2 + O2 + rnd const __m128i T2_02B = _mm_add_epi32(T12B, O02B); const __m128i T2_03A = _mm_add_epi32(T13A, O03A); // E3 + O3 + rnd const __m128i T2_03B = _mm_add_epi32(T13B, O03B); const __m128i T2_04A = _mm_add_epi32(T14A, O04A); // E4 const __m128i T2_04B = _mm_add_epi32(T14B, O04B); const __m128i T2_05A = _mm_add_epi32(T15A, O05A); // E5 const __m128i T2_05B = _mm_add_epi32(T15B, O05B); const __m128i T2_06A = _mm_add_epi32(T16A, O06A); // E6 const __m128i T2_06B = _mm_add_epi32(T16B, O06B); const __m128i T2_07A = _mm_add_epi32(T17A, O07A); // E7 const __m128i T2_07B = _mm_add_epi32(T17B, O07B); const __m128i T2_08A = _mm_add_epi32(T18A, O08A); // E8 const __m128i T2_08B = _mm_add_epi32(T18B, O08B); const __m128i T2_09A = _mm_add_epi32(T19A, O09A); // E9 const __m128i T2_09B = _mm_add_epi32(T19B, O09B); const __m128i T2_10A = _mm_add_epi32(T1AA, O10A); // E10 const __m128i T2_10B = _mm_add_epi32(T1AB, O10B); const __m128i T2_11A = _mm_add_epi32(T1BA, O11A); // E11 const __m128i T2_11B = 
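/* Output assembly for this pass: T1x already carries E_k + rnd, so row k
 * becomes (E_k + O_k + rnd) >> nShift (T2_00..T2_15) and row 31-k becomes
 * (E_k - O_k + rnd) >> nShift (T2_16..T2_31); the shift itself is applied
 * in the T3_xx block that follows. */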
_mm_add_epi32(T1BB, O11B); const __m128i T2_12A = _mm_add_epi32(T1CA, O12A); // E12 const __m128i T2_12B = _mm_add_epi32(T1CB, O12B); const __m128i T2_13A = _mm_add_epi32(T1DA, O13A); // E13 const __m128i T2_13B = _mm_add_epi32(T1DB, O13B); const __m128i T2_14A = _mm_add_epi32(T1EA, O14A); // E14 const __m128i T2_14B = _mm_add_epi32(T1EB, O14B); const __m128i T2_15A = _mm_add_epi32(T1FA, O15A); // E15 const __m128i T2_15B = _mm_add_epi32(T1FB, O15B); const __m128i T2_31A = _mm_sub_epi32(T10A, O00A); // E0 - O0 + rnd const __m128i T2_31B = _mm_sub_epi32(T10B, O00B); const __m128i T2_30A = _mm_sub_epi32(T11A, O01A); // E1 - O1 + rnd const __m128i T2_30B = _mm_sub_epi32(T11B, O01B); const __m128i T2_29A = _mm_sub_epi32(T12A, O02A); // E2 - O2 + rnd const __m128i T2_29B = _mm_sub_epi32(T12B, O02B); const __m128i T2_28A = _mm_sub_epi32(T13A, O03A); // E3 - O3 + rnd const __m128i T2_28B = _mm_sub_epi32(T13B, O03B); const __m128i T2_27A = _mm_sub_epi32(T14A, O04A); // E4 const __m128i T2_27B = _mm_sub_epi32(T14B, O04B); const __m128i T2_26A = _mm_sub_epi32(T15A, O05A); // E5 const __m128i T2_26B = _mm_sub_epi32(T15B, O05B); const __m128i T2_25A = _mm_sub_epi32(T16A, O06A); // E6 const __m128i T2_25B = _mm_sub_epi32(T16B, O06B); const __m128i T2_24A = _mm_sub_epi32(T17A, O07A); // E7 const __m128i T2_24B = _mm_sub_epi32(T17B, O07B); const __m128i T2_23A = _mm_sub_epi32(T18A, O08A); // const __m128i T2_23B = _mm_sub_epi32(T18B, O08B); const __m128i T2_22A = _mm_sub_epi32(T19A, O09A); // const __m128i T2_22B = _mm_sub_epi32(T19B, O09B); const __m128i T2_21A = _mm_sub_epi32(T1AA, O10A); // const __m128i T2_21B = _mm_sub_epi32(T1AB, O10B); const __m128i T2_20A = _mm_sub_epi32(T1BA, O11A); // const __m128i T2_20B = _mm_sub_epi32(T1BB, O11B); const __m128i T2_19A = _mm_sub_epi32(T1CA, O12A); // const __m128i T2_19B = _mm_sub_epi32(T1CB, O12B); const __m128i T2_18A = _mm_sub_epi32(T1DA, O13A); // const __m128i T2_18B = _mm_sub_epi32(T1DB, O13B); const __m128i T2_17A = _mm_sub_epi32(T1EA, O14A); // const __m128i T2_17B = _mm_sub_epi32(T1EB, O14B); const __m128i T2_16A = _mm_sub_epi32(T1FA, O15A); // const __m128i T2_16B = _mm_sub_epi32(T1FB, O15B); const __m128i T3_00A = _mm_srai_epi32(T2_00A, nShift); // [30 20 10 00] const __m128i T3_00B = _mm_srai_epi32(T2_00B, nShift); // [70 60 50 40] const __m128i T3_01A = _mm_srai_epi32(T2_01A, nShift); // [31 21 11 01] const __m128i T3_01B = _mm_srai_epi32(T2_01B, nShift); // [71 61 51 41] const __m128i T3_02A = _mm_srai_epi32(T2_02A, nShift); // [32 22 12 02] const __m128i T3_02B = _mm_srai_epi32(T2_02B, nShift); // [72 62 52 42] const __m128i T3_03A = _mm_srai_epi32(T2_03A, nShift); // [33 23 13 03] const __m128i T3_03B = _mm_srai_epi32(T2_03B, nShift); // [73 63 53 43] const __m128i T3_04A = _mm_srai_epi32(T2_04A, nShift); // [33 24 14 04] const __m128i T3_04B = _mm_srai_epi32(T2_04B, nShift); // [74 64 54 44] const __m128i T3_05A = _mm_srai_epi32(T2_05A, nShift); // [35 25 15 05] const __m128i T3_05B = _mm_srai_epi32(T2_05B, nShift); // [75 65 55 45] const __m128i T3_06A = _mm_srai_epi32(T2_06A, nShift); // [36 26 16 06] const __m128i T3_06B = _mm_srai_epi32(T2_06B, nShift); // [76 66 56 46] const __m128i T3_07A = _mm_srai_epi32(T2_07A, nShift); // [37 27 17 07] const __m128i T3_07B = _mm_srai_epi32(T2_07B, nShift); // [77 67 57 47] const __m128i T3_08A = _mm_srai_epi32(T2_08A, nShift); // [30 20 10 00] x8 const __m128i T3_08B = _mm_srai_epi32(T2_08B, nShift); // [70 60 50 40] const __m128i T3_09A = _mm_srai_epi32(T2_09A, nShift); // [31 21 11 01] x9 const 
__m128i T3_09B = _mm_srai_epi32(T2_09B, nShift); // [71 61 51 41] const __m128i T3_10A = _mm_srai_epi32(T2_10A, nShift); // [32 22 12 02] xA const __m128i T3_10B = _mm_srai_epi32(T2_10B, nShift); // [72 62 52 42] const __m128i T3_11A = _mm_srai_epi32(T2_11A, nShift); // [33 23 13 03] xB const __m128i T3_11B = _mm_srai_epi32(T2_11B, nShift); // [73 63 53 43] const __m128i T3_12A = _mm_srai_epi32(T2_12A, nShift); // [33 24 14 04] xC const __m128i T3_12B = _mm_srai_epi32(T2_12B, nShift); // [74 64 54 44] const __m128i T3_13A = _mm_srai_epi32(T2_13A, nShift); // [35 25 15 05] xD const __m128i T3_13B = _mm_srai_epi32(T2_13B, nShift); // [75 65 55 45] const __m128i T3_14A = _mm_srai_epi32(T2_14A, nShift); // [36 26 16 06] xE const __m128i T3_14B = _mm_srai_epi32(T2_14B, nShift); // [76 66 56 46] const __m128i T3_15A = _mm_srai_epi32(T2_15A, nShift); // [37 27 17 07] xF const __m128i T3_15B = _mm_srai_epi32(T2_15B, nShift); // [77 67 57 47] const __m128i T3_16A = _mm_srai_epi32(T2_16A, nShift); // [30 20 10 00] const __m128i T3_16B = _mm_srai_epi32(T2_16B, nShift); // [70 60 50 40] const __m128i T3_17A = _mm_srai_epi32(T2_17A, nShift); // [31 21 11 01] const __m128i T3_17B = _mm_srai_epi32(T2_17B, nShift); // [71 61 51 41] const __m128i T3_18A = _mm_srai_epi32(T2_18A, nShift); // [32 22 12 02] const __m128i T3_18B = _mm_srai_epi32(T2_18B, nShift); // [72 62 52 42] const __m128i T3_19A = _mm_srai_epi32(T2_19A, nShift); // [33 23 13 03] const __m128i T3_19B = _mm_srai_epi32(T2_19B, nShift); // [73 63 53 43] const __m128i T3_20A = _mm_srai_epi32(T2_20A, nShift); // [33 24 14 04] const __m128i T3_20B = _mm_srai_epi32(T2_20B, nShift); // [74 64 54 44] const __m128i T3_21A = _mm_srai_epi32(T2_21A, nShift); // [35 25 15 05] const __m128i T3_21B = _mm_srai_epi32(T2_21B, nShift); // [75 65 55 45] const __m128i T3_22A = _mm_srai_epi32(T2_22A, nShift); // [36 26 16 06] const __m128i T3_22B = _mm_srai_epi32(T2_22B, nShift); // [76 66 56 46] const __m128i T3_23A = _mm_srai_epi32(T2_23A, nShift); // [37 27 17 07] const __m128i T3_23B = _mm_srai_epi32(T2_23B, nShift); // [77 67 57 47] const __m128i T3_24A = _mm_srai_epi32(T2_24A, nShift); // [30 20 10 00] x8 const __m128i T3_24B = _mm_srai_epi32(T2_24B, nShift); // [70 60 50 40] const __m128i T3_25A = _mm_srai_epi32(T2_25A, nShift); // [31 21 11 01] x9 const __m128i T3_25B = _mm_srai_epi32(T2_25B, nShift); // [71 61 51 41] const __m128i T3_26A = _mm_srai_epi32(T2_26A, nShift); // [32 22 12 02] xA const __m128i T3_26B = _mm_srai_epi32(T2_26B, nShift); // [72 62 52 42] const __m128i T3_27A = _mm_srai_epi32(T2_27A, nShift); // [33 23 13 03] xB const __m128i T3_27B = _mm_srai_epi32(T2_27B, nShift); // [73 63 53 43] const __m128i T3_28A = _mm_srai_epi32(T2_28A, nShift); // [33 24 14 04] xC const __m128i T3_28B = _mm_srai_epi32(T2_28B, nShift); // [74 64 54 44] const __m128i T3_29A = _mm_srai_epi32(T2_29A, nShift); // [35 25 15 05] xD const __m128i T3_29B = _mm_srai_epi32(T2_29B, nShift); // [75 65 55 45] const __m128i T3_30A = _mm_srai_epi32(T2_30A, nShift); // [36 26 16 06] xE const __m128i T3_30B = _mm_srai_epi32(T2_30B, nShift); // [76 66 56 46] const __m128i T3_31A = _mm_srai_epi32(T2_31A, nShift); // [37 27 17 07] xF const __m128i T3_31B = _mm_srai_epi32(T2_31B, nShift); // [77 67 57 47] res00[part] = _mm_packs_epi32(T3_00A, T3_00B); // [70 60 50 40 30 20 10 00] res01[part] = _mm_packs_epi32(T3_01A, T3_01B); // [71 61 51 41 31 21 11 01] res02[part] = _mm_packs_epi32(T3_02A, T3_02B); // [72 62 52 42 32 22 12 02] res03[part] = _mm_packs_epi32(T3_03A, T3_03B); // 
[73 63 53 43 33 23 13 03] res04[part] = _mm_packs_epi32(T3_04A, T3_04B); // [74 64 54 44 34 24 14 04] res05[part] = _mm_packs_epi32(T3_05A, T3_05B); // [75 65 55 45 35 25 15 05] res06[part] = _mm_packs_epi32(T3_06A, T3_06B); // [76 66 56 46 36 26 16 06] res07[part] = _mm_packs_epi32(T3_07A, T3_07B); // [77 67 57 47 37 27 17 07] res08[part] = _mm_packs_epi32(T3_08A, T3_08B); // [A0 ... 80] res09[part] = _mm_packs_epi32(T3_09A, T3_09B); // [A1 ... 81] res10[part] = _mm_packs_epi32(T3_10A, T3_10B); // [A2 ... 82] res11[part] = _mm_packs_epi32(T3_11A, T3_11B); // [A3 ... 83] res12[part] = _mm_packs_epi32(T3_12A, T3_12B); // [A4 ... 84] res13[part] = _mm_packs_epi32(T3_13A, T3_13B); // [A5 ... 85] res14[part] = _mm_packs_epi32(T3_14A, T3_14B); // [A6 ... 86] res15[part] = _mm_packs_epi32(T3_15A, T3_15B); // [A7 ... 87] res16[part] = _mm_packs_epi32(T3_16A, T3_16B); res17[part] = _mm_packs_epi32(T3_17A, T3_17B); res18[part] = _mm_packs_epi32(T3_18A, T3_18B); res19[part] = _mm_packs_epi32(T3_19A, T3_19B); res20[part] = _mm_packs_epi32(T3_20A, T3_20B); res21[part] = _mm_packs_epi32(T3_21A, T3_21B); res22[part] = _mm_packs_epi32(T3_22A, T3_22B); res23[part] = _mm_packs_epi32(T3_23A, T3_23B); res24[part] = _mm_packs_epi32(T3_24A, T3_24B); res25[part] = _mm_packs_epi32(T3_25A, T3_25B); res26[part] = _mm_packs_epi32(T3_26A, T3_26B); res27[part] = _mm_packs_epi32(T3_27A, T3_27B); res28[part] = _mm_packs_epi32(T3_28A, T3_28B); res29[part] = _mm_packs_epi32(T3_29A, T3_29B); res30[part] = _mm_packs_epi32(T3_30A, T3_30B); res31[part] = _mm_packs_epi32(T3_31A, T3_31B); } } //transpose matrix 8x8 16bit. { __m128i tr0_0, tr0_1, tr0_2, tr0_3, tr0_4, tr0_5, tr0_6, tr0_7; __m128i tr1_0, tr1_1, tr1_2, tr1_3, tr1_4, tr1_5, tr1_6, tr1_7; #define TRANSPOSE_8x8_16BIT(I0, I1, I2, I3, I4, I5, I6, I7, O0, O1, O2, O3, O4, O5, O6, O7) \ tr0_0 = _mm_unpacklo_epi16(I0, I1); \ tr0_1 = _mm_unpacklo_epi16(I2, I3); \ tr0_2 = _mm_unpackhi_epi16(I0, I1); \ tr0_3 = _mm_unpackhi_epi16(I2, I3); \ tr0_4 = _mm_unpacklo_epi16(I4, I5); \ tr0_5 = _mm_unpacklo_epi16(I6, I7); \ tr0_6 = _mm_unpackhi_epi16(I4, I5); \ tr0_7 = _mm_unpackhi_epi16(I6, I7); \ tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \ tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \ tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \ tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \ O0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ O1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ O2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ O3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ O4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \ O5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \ O6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \ O7 = _mm_unpackhi_epi64(tr1_3, tr1_7); TRANSPOSE_8x8_16BIT(res00[0], res01[0], res02[0], res03[0], res04[0], res05[0], res06[0], res07[0], in00[0], in01[0], in02[0], in03[0], in04[0], in05[0], in06[0], in07[0]) TRANSPOSE_8x8_16BIT(res00[1], res01[1], res02[1], res03[1], res04[1], res05[1], res06[1], res07[1], in08[0], in09[0], in10[0], in11[0], in12[0], in13[0], in14[0], in15[0]) TRANSPOSE_8x8_16BIT(res00[2], res01[2], res02[2], res03[2], res04[2], res05[2], res06[2], res07[2], in16[0], in17[0], in18[0], in19[0], in20[0], in21[0], in22[0], in23[0]) TRANSPOSE_8x8_16BIT(res00[3], res01[3], res02[3], res03[3], res04[3], res05[3], res06[3], res07[3], in24[0], in25[0], in26[0], in27[0], in28[0], in29[0], in30[0], in31[0]) TRANSPOSE_8x8_16BIT(res08[0], res09[0], 
res10[0], res11[0], res12[0], res13[0], res14[0], res15[0], in00[1], in01[1], in02[1], in03[1], in04[1], in05[1], in06[1], in07[1]) TRANSPOSE_8x8_16BIT(res08[1], res09[1], res10[1], res11[1], res12[1], res13[1], res14[1], res15[1], in08[1], in09[1], in10[1], in11[1], in12[1], in13[1], in14[1], in15[1]) TRANSPOSE_8x8_16BIT(res08[2], res09[2], res10[2], res11[2], res12[2], res13[2], res14[2], res15[2], in16[1], in17[1], in18[1], in19[1], in20[1], in21[1], in22[1], in23[1]) TRANSPOSE_8x8_16BIT(res08[3], res09[3], res10[3], res11[3], res12[3], res13[3], res14[3], res15[3], in24[1], in25[1], in26[1], in27[1], in28[1], in29[1], in30[1], in31[1]) TRANSPOSE_8x8_16BIT(res16[0], res17[0], res18[0], res19[0], res20[0], res21[0], res22[0], res23[0], in00[2], in01[2], in02[2], in03[2], in04[2], in05[2], in06[2], in07[2]) TRANSPOSE_8x8_16BIT(res16[1], res17[1], res18[1], res19[1], res20[1], res21[1], res22[1], res23[1], in08[2], in09[2], in10[2], in11[2], in12[2], in13[2], in14[2], in15[2]) TRANSPOSE_8x8_16BIT(res16[2], res17[2], res18[2], res19[2], res20[2], res21[2], res22[2], res23[2], in16[2], in17[2], in18[2], in19[2], in20[2], in21[2], in22[2], in23[2]) TRANSPOSE_8x8_16BIT(res16[3], res17[3], res18[3], res19[3], res20[3], res21[3], res22[3], res23[3], in24[2], in25[2], in26[2], in27[2], in28[2], in29[2], in30[2], in31[2]) TRANSPOSE_8x8_16BIT(res24[0], res25[0], res26[0], res27[0], res28[0], res29[0], res30[0], res31[0], in00[3], in01[3], in02[3], in03[3], in04[3], in05[3], in06[3], in07[3]) TRANSPOSE_8x8_16BIT(res24[1], res25[1], res26[1], res27[1], res28[1], res29[1], res30[1], res31[1], in08[3], in09[3], in10[3], in11[3], in12[3], in13[3], in14[3], in15[3]) TRANSPOSE_8x8_16BIT(res24[2], res25[2], res26[2], res27[2], res28[2], res29[2], res30[2], res31[2], in16[3], in17[3], in18[3], in19[3], in20[3], in21[3], in22[3], in23[3]) TRANSPOSE_8x8_16BIT(res24[3], res25[3], res26[3], res27[3], res28[3], res29[3], res30[3], res31[3], in24[3], in25[3], in26[3], in27[3], in28[3], in29[3], in30[3], in31[3]) #undef TRANSPOSE_8x8_16BIT } } //clip { __m128i max_val = _mm_set1_epi16((1 << (clip_depth2 - 1)) - 1); __m128i min_val = _mm_set1_epi16(-(1 << (clip_depth2 - 1))); int k; for (k = 0; k < 4; k++) { in00[k] = _mm_max_epi16(_mm_min_epi16(in00[k], max_val), min_val); in01[k] = _mm_max_epi16(_mm_min_epi16(in01[k], max_val), min_val); in02[k] = _mm_max_epi16(_mm_min_epi16(in02[k], max_val), min_val); in03[k] = _mm_max_epi16(_mm_min_epi16(in03[k], max_val), min_val); in04[k] = _mm_max_epi16(_mm_min_epi16(in04[k], max_val), min_val); in05[k] = _mm_max_epi16(_mm_min_epi16(in05[k], max_val), min_val); in06[k] = _mm_max_epi16(_mm_min_epi16(in06[k], max_val), min_val); in07[k] = _mm_max_epi16(_mm_min_epi16(in07[k], max_val), min_val); in08[k] = _mm_max_epi16(_mm_min_epi16(in08[k], max_val), min_val); in09[k] = _mm_max_epi16(_mm_min_epi16(in09[k], max_val), min_val); in10[k] = _mm_max_epi16(_mm_min_epi16(in10[k], max_val), min_val); in11[k] = _mm_max_epi16(_mm_min_epi16(in11[k], max_val), min_val); in12[k] = _mm_max_epi16(_mm_min_epi16(in12[k], max_val), min_val); in13[k] = _mm_max_epi16(_mm_min_epi16(in13[k], max_val), min_val); in14[k] = _mm_max_epi16(_mm_min_epi16(in14[k], max_val), min_val); in15[k] = _mm_max_epi16(_mm_min_epi16(in15[k], max_val), min_val); in16[k] = _mm_max_epi16(_mm_min_epi16(in16[k], max_val), min_val); in17[k] = _mm_max_epi16(_mm_min_epi16(in17[k], max_val), min_val); in18[k] = _mm_max_epi16(_mm_min_epi16(in18[k], max_val), min_val); in19[k] = _mm_max_epi16(_mm_min_epi16(in19[k], max_val), 
min_val); in20[k] = _mm_max_epi16(_mm_min_epi16(in20[k], max_val), min_val); in21[k] = _mm_max_epi16(_mm_min_epi16(in21[k], max_val), min_val); in22[k] = _mm_max_epi16(_mm_min_epi16(in22[k], max_val), min_val); in23[k] = _mm_max_epi16(_mm_min_epi16(in23[k], max_val), min_val); in24[k] = _mm_max_epi16(_mm_min_epi16(in24[k], max_val), min_val); in25[k] = _mm_max_epi16(_mm_min_epi16(in25[k], max_val), min_val); in26[k] = _mm_max_epi16(_mm_min_epi16(in26[k], max_val), min_val); in27[k] = _mm_max_epi16(_mm_min_epi16(in27[k], max_val), min_val); in28[k] = _mm_max_epi16(_mm_min_epi16(in28[k], max_val), min_val); in29[k] = _mm_max_epi16(_mm_min_epi16(in29[k], max_val), min_val); in30[k] = _mm_max_epi16(_mm_min_epi16(in30[k], max_val), min_val); in31[k] = _mm_max_epi16(_mm_min_epi16(in31[k], max_val), min_val); } } // Add for (i = 0; i < 2; i++) { #define STORE_LINE(L0, L1, L2, L3, L4, L5, L6, L7, H0, H1, H2, H3, H4, H5, H6, H7, offsetV, offsetH) \ _mm_storeu_si128((__m128i*)(dst + (0 + (offsetV)) * i_dst + (offsetH)+0), L0); \ _mm_storeu_si128((__m128i*)(dst + (0 + (offsetV)) * i_dst + (offsetH)+8), H0); \ _mm_storeu_si128((__m128i*)(dst + (1 + (offsetV)) * i_dst + (offsetH)+0), L1); \ _mm_storeu_si128((__m128i*)(dst + (1 + (offsetV)) * i_dst + (offsetH)+8), H1); \ _mm_storeu_si128((__m128i*)(dst + (2 + (offsetV)) * i_dst + (offsetH)+0), L2); \ _mm_storeu_si128((__m128i*)(dst + (2 + (offsetV)) * i_dst + (offsetH)+8), H2); \ _mm_storeu_si128((__m128i*)(dst + (3 + (offsetV)) * i_dst + (offsetH)+0), L3); \ _mm_storeu_si128((__m128i*)(dst + (3 + (offsetV)) * i_dst + (offsetH)+8), H3); \ _mm_storeu_si128((__m128i*)(dst + (4 + (offsetV)) * i_dst + (offsetH)+0), L4); \ _mm_storeu_si128((__m128i*)(dst + (4 + (offsetV)) * i_dst + (offsetH)+8), H4); \ _mm_storeu_si128((__m128i*)(dst + (5 + (offsetV)) * i_dst + (offsetH)+0), L5); \ _mm_storeu_si128((__m128i*)(dst + (5 + (offsetV)) * i_dst + (offsetH)+8), H5); \ _mm_storeu_si128((__m128i*)(dst + (6 + (offsetV)) * i_dst + (offsetH)+0), L6); \ _mm_storeu_si128((__m128i*)(dst + (6 + (offsetV)) * i_dst + (offsetH)+8), H6); \ _mm_storeu_si128((__m128i*)(dst + (7 + (offsetV)) * i_dst + (offsetH)+0), L7); \ _mm_storeu_si128((__m128i*)(dst + (7 + (offsetV)) * i_dst + (offsetH)+8), H7); const int k = i * 2; STORE_LINE(in00[k], in01[k], in02[k], in03[k], in04[k], in05[k], in06[k], in07[k], in00[k + 1], in01[k + 1], in02[k + 1], in03[k + 1], in04[k + 1], in05[k + 1], in06[k + 1], in07[k + 1], 0, i * 16) STORE_LINE(in08[k], in09[k], in10[k], in11[k], in12[k], in13[k], in14[k], in15[k], in08[k + 1], in09[k + 1], in10[k + 1], in11[k + 1], in12[k + 1], in13[k + 1], in14[k + 1], in15[k + 1], 8, i * 16) STORE_LINE(in16[k], in17[k], in18[k], in19[k], in20[k], in21[k], in22[k], in23[k], in16[k + 1], in17[k + 1], in18[k + 1], in19[k + 1], in20[k + 1], in21[k + 1], in22[k + 1], in23[k + 1], 16, i * 16) STORE_LINE(in24[k], in25[k], in26[k], in27[k], in28[k], in29[k], in30[k], in31[k], in24[k + 1], in25[k + 1], in26[k + 1], in27[k + 1], in28[k + 1], in29[k + 1], in30[k + 1], in31[k + 1], 24, i * 16) #undef STORE_LINE } } /* --------------------------------------------------------------------------- */ void idct_32x32_half_sse128(const coeff_t *src, coeff_t *dst, int i_dst) { /* half decoding: only the low-frequency 16x16 coefficients in the top-left corner are non-zero, so only the first 16 rows and 16 columns of the coefficient block are read */ int a_flag = i_dst & 0x01; int shift2 = 20 - g_bit_depth - a_flag; int clip_depth2 = g_bit_depth + 1 + a_flag; const __m128i c16_p45_p45 = _mm_set1_epi32(0x002D002D); const __m128i c16_p43_p44 = _mm_set1_epi32(0x002B002C); const __m128i c16_p39_p41 = 
_mm_set1_epi32(0x00270029); const __m128i c16_p34_p36 = _mm_set1_epi32(0x00220024); const __m128i c16_p41_p45 = _mm_set1_epi32(0x0029002D); const __m128i c16_p23_p34 = _mm_set1_epi32(0x00170022); const __m128i c16_n02_p11 = _mm_set1_epi32(0xFFFE000B); const __m128i c16_n27_n15 = _mm_set1_epi32(0xFFE5FFF1); const __m128i c16_p34_p44 = _mm_set1_epi32(0x0022002C); const __m128i c16_n07_p15 = _mm_set1_epi32(0xFFF9000F); const __m128i c16_n41_n27 = _mm_set1_epi32(0xFFD7FFE5); const __m128i c16_n39_n45 = _mm_set1_epi32(0xFFD9FFD3); const __m128i c16_p43_p45 = _mm_set1_epi32(0x002B002D); const __m128i c16_p23_p43 = _mm_set1_epi32(0x0017002B); const __m128i c16_n34_n07 = _mm_set1_epi32(0xFFDEFFF9); const __m128i c16_n36_n45 = _mm_set1_epi32(0xFFDCFFD3); const __m128i c16_p19_n11 = _mm_set1_epi32(0x0013FFF5); const __m128i c16_p11_p41 = _mm_set1_epi32(0x000B0029); const __m128i c16_n45_n27 = _mm_set1_epi32(0xFFD3FFE5); const __m128i c16_p07_n30 = _mm_set1_epi32(0x0007FFE2); const __m128i c16_p43_p39 = _mm_set1_epi32(0x002B0027); const __m128i c16_n02_p39 = _mm_set1_epi32(0xFFFE0027); const __m128i c16_n36_n41 = _mm_set1_epi32(0xFFDCFFD7); const __m128i c16_p43_p07 = _mm_set1_epi32(0x002B0007); const __m128i c16_n11_p34 = _mm_set1_epi32(0xFFF50022); const __m128i c16_n15_p36 = _mm_set1_epi32(0xFFF10024); const __m128i c16_n11_n45 = _mm_set1_epi32(0xFFF5FFD3); const __m128i c16_p34_p39 = _mm_set1_epi32(0x00220027); const __m128i c16_n45_n19 = _mm_set1_epi32(0xFFD3FFED); const __m128i c16_n27_p34 = _mm_set1_epi32(0xFFE50022); const __m128i c16_p19_n39 = _mm_set1_epi32(0x0013FFD9); const __m128i c16_n11_p43 = _mm_set1_epi32(0xFFF5002B); const __m128i c16_p02_n45 = _mm_set1_epi32(0x0002FFD3); const __m128i c16_n36_p30 = _mm_set1_epi32(0xFFDC001E); const __m128i c16_p41_n23 = _mm_set1_epi32(0x0029FFE9); const __m128i c16_n44_p15 = _mm_set1_epi32(0xFFD4000F); const __m128i c16_p45_n07 = _mm_set1_epi32(0x002DFFF9); const __m128i c16_n43_p27 = _mm_set1_epi32(0xFFD5001B); const __m128i c16_p44_n02 = _mm_set1_epi32(0x002CFFFE); const __m128i c16_n30_n23 = _mm_set1_epi32(0xFFE2FFE9); const __m128i c16_p07_p41 = _mm_set1_epi32(0x00070029); const __m128i c16_n45_p23 = _mm_set1_epi32(0xFFD30017); const __m128i c16_p27_p19 = _mm_set1_epi32(0x001B0013); const __m128i c16_p15_n45 = _mm_set1_epi32(0x000FFFD3); const __m128i c16_n44_p30 = _mm_set1_epi32(0xFFD4001E); const __m128i c16_n44_p19 = _mm_set1_epi32(0xFFD40013); const __m128i c16_n02_p36 = _mm_set1_epi32(0xFFFE0024); const __m128i c16_p45_n34 = _mm_set1_epi32(0x002DFFDE); const __m128i c16_n15_n23 = _mm_set1_epi32(0xFFF1FFE9); const __m128i c16_n39_p15 = _mm_set1_epi32(0xFFD9000F); const __m128i c16_n30_p45 = _mm_set1_epi32(0xFFE2002D); const __m128i c16_p27_p02 = _mm_set1_epi32(0x001B0002); const __m128i c16_p41_n44 = _mm_set1_epi32(0x0029FFD4); const __m128i c16_n30_p11 = _mm_set1_epi32(0xFFE2000B); const __m128i c16_n45_p43 = _mm_set1_epi32(0xFFD3002B); const __m128i c16_n19_p36 = _mm_set1_epi32(0xFFED0024); const __m128i c16_p23_n02 = _mm_set1_epi32(0x0017FFFE); const __m128i c16_n19_p07 = _mm_set1_epi32(0xFFED0007); const __m128i c16_n39_p30 = _mm_set1_epi32(0xFFD9001E); const __m128i c16_n45_p44 = _mm_set1_epi32(0xFFD3002C); const __m128i c16_n36_p43 = _mm_set1_epi32(0xFFDC002B); const __m128i c16_n07_p02 = _mm_set1_epi32(0xFFF90002); const __m128i c16_n15_p11 = _mm_set1_epi32(0xFFF1000B); const __m128i c16_n23_p19 = _mm_set1_epi32(0xFFE90013); const __m128i c16_n30_p27 = _mm_set1_epi32(0xFFE2001B); const __m128i c16_p35_p40 = 
_mm_set1_epi32(0x00230028); const __m128i c16_p29_p43 = _mm_set1_epi32(0x001D002B); const __m128i c16_n21_p04 = _mm_set1_epi32(0xFFEB0004); const __m128i c16_p04_p40 = _mm_set1_epi32(0x00040028); const __m128i c16_n43_n35 = _mm_set1_epi32(0xFFD5FFDD); const __m128i c16_n21_p35 = _mm_set1_epi32(0xFFEB0023); const __m128i c16_p04_n43 = _mm_set1_epi32(0x0004FFD5); const __m128i c16_n40_p29 = _mm_set1_epi32(0xFFD8001D); const __m128i c16_p45_n13 = _mm_set1_epi32(0x002DFFF3); const __m128i c16_n45_p21 = _mm_set1_epi32(0xFFD30015); const __m128i c16_p13_p29 = _mm_set1_epi32(0x000D001D); const __m128i c16_n35_p13 = _mm_set1_epi32(0xFFDD000D); const __m128i c16_n40_p45 = _mm_set1_epi32(0xFFD8002D); const __m128i c16_n13_p04 = _mm_set1_epi32(0xFFF30004); const __m128i c16_n29_p21 = _mm_set1_epi32(0xFFE30015); const __m128i c16_p38_p44 = _mm_set1_epi32(0x0026002C); const __m128i c16_n09_p38 = _mm_set1_epi32(0xFFF70026); const __m128i c16_n44_p25 = _mm_set1_epi32(0xFFD40019); const __m128i c16_n25_p09 = _mm_set1_epi32(0xFFE70009); const __m128i c16_p17_p42 = _mm_set1_epi32(0x0011002A); const __m128i c16_n42_p17 = _mm_set1_epi32(0xFFD60011); const __m128i c16_p32_p32 = _mm_set1_epi32(0x00200020); const __m128i c16_n32_p32 = _mm_set1_epi32(0xFFE00020); __m128i c32_rnd = _mm_set1_epi32(16); // add1 __m128i Zero_16 = _mm_set1_epi16(0); int nShift = 5; int i, part; // DCT1 __m128i in00[4], in01[4], in02[4], in03[4], in04[4], in05[4], in06[4], in07[4], in08[4], in09[4], in10[4], in11[4], in12[4], in13[4], in14[4], in15[4]; __m128i in16[4], in17[4], in18[4], in19[4], in20[4], in21[4], in22[4], in23[4], in24[4], in25[4], in26[4], in27[4], in28[4], in29[4], in30[4], in31[4]; __m128i res00[4], res01[4], res02[4], res03[4], res04[4], res05[4], res06[4], res07[4], res08[4], res09[4], res10[4], res11[4], res12[4], res13[4], res14[4], res15[4]; __m128i res16[4], res17[4], res18[4], res19[4], res20[4], res21[4], res22[4], res23[4], res24[4], res25[4], res26[4], res27[4], res28[4], res29[4], res30[4], res31[4]; i_dst &= 0xFE; /* remember to remove the flag bit */ for (i = 0; i < 2; i++) { const int offset = (i << 3); in00[i] = _mm_loadu_si128((const __m128i*)&src[ 0 * 32 + offset]); in01[i] = _mm_loadu_si128((const __m128i*)&src[ 1 * 32 + offset]); in02[i] = _mm_loadu_si128((const __m128i*)&src[ 2 * 32 + offset]); in03[i] = _mm_loadu_si128((const __m128i*)&src[ 3 * 32 + offset]); in04[i] = _mm_loadu_si128((const __m128i*)&src[ 4 * 32 + offset]); in05[i] = _mm_loadu_si128((const __m128i*)&src[ 5 * 32 + offset]); in06[i] = _mm_loadu_si128((const __m128i*)&src[ 6 * 32 + offset]); in07[i] = _mm_loadu_si128((const __m128i*)&src[ 7 * 32 + offset]); in08[i] = _mm_loadu_si128((const __m128i*)&src[ 8 * 32 + offset]); in09[i] = _mm_loadu_si128((const __m128i*)&src[ 9 * 32 + offset]); in10[i] = _mm_loadu_si128((const __m128i*)&src[10 * 32 + offset]); in11[i] = _mm_loadu_si128((const __m128i*)&src[11 * 32 + offset]); in12[i] = _mm_loadu_si128((const __m128i*)&src[12 * 32 + offset]); in13[i] = _mm_loadu_si128((const __m128i*)&src[13 * 32 + offset]); in14[i] = _mm_loadu_si128((const __m128i*)&src[14 * 32 + offset]); in15[i] = _mm_loadu_si128((const __m128i*)&src[15 * 32 + offset]); } //pass=1 for (part = 0; part < 2; part++) { const __m128i T_00_00A = _mm_unpacklo_epi16(in01[part], in03[part]); // [33 13 32 12 31 11 30 10] const __m128i T_00_00B = _mm_unpackhi_epi16(in01[part], in03[part]); // [37 17 36 16 35 15 34 14] const __m128i T_00_01A = _mm_unpacklo_epi16(in05[part], in07[part]); // [ ] const __m128i T_00_01B = 
_mm_unpackhi_epi16(in05[part], in07[part]); // [ ] const __m128i T_00_02A = _mm_unpacklo_epi16(in09[part], in11[part]); // [ ] const __m128i T_00_02B = _mm_unpackhi_epi16(in09[part], in11[part]); // [ ] const __m128i T_00_03A = _mm_unpacklo_epi16(in13[part], in15[part]); // [ ] const __m128i T_00_03B = _mm_unpackhi_epi16(in13[part], in15[part]); // [ ] const __m128i T_00_08A = _mm_unpacklo_epi16(in02[part], in06[part]); // [ ] const __m128i T_00_08B = _mm_unpackhi_epi16(in02[part], in06[part]); // [ ] const __m128i T_00_09A = _mm_unpacklo_epi16(in10[part], in14[part]); // [ ] const __m128i T_00_09B = _mm_unpackhi_epi16(in10[part], in14[part]); // [ ] const __m128i T_00_12A = _mm_unpacklo_epi16(in04[part], in12[part]); // [ ] const __m128i T_00_12B = _mm_unpackhi_epi16(in04[part], in12[part]); // [ ] const __m128i T_00_14A = _mm_unpacklo_epi16(in08[part], Zero_16); // const __m128i T_00_14B = _mm_unpackhi_epi16(in08[part], Zero_16); // [ ] const __m128i T_00_15A = _mm_unpacklo_epi16(in00[part], Zero_16); // const __m128i T_00_15B = _mm_unpackhi_epi16(in00[part], Zero_16); // [ ] __m128i O00A, O01A, O02A, O03A, O04A, O05A, O06A, O07A, O08A, O09A, O10A, O11A, O12A, O13A, O14A, O15A; __m128i O00B, O01B, O02B, O03B, O04B, O05B, O06B, O07B, O08B, O09B, O10B, O11B, O12B, O13B, O14B, O15B; __m128i EO0A, EO1A, EO2A, EO3A, EO4A, EO5A, EO6A, EO7A; __m128i EO0B, EO1B, EO2B, EO3B, EO4B, EO5B, EO6B, EO7B; { __m128i T00, T01; #define COMPUTE_ROW(r0103, r0507, r0911, r1315, c0103, c0507, c0911, c1315, row) \ T00 = _mm_add_epi32(_mm_madd_epi16(r0103, c0103), _mm_madd_epi16(r0507, c0507)); \ T01 = _mm_add_epi32(_mm_madd_epi16(r0911, c0911), _mm_madd_epi16(r1315, c1315)); \ row = _mm_add_epi32(T00, T01); COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_p45_p45, c16_p43_p44, c16_p39_p41, c16_p34_p36, O00A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_p41_p45, c16_p23_p34, c16_n02_p11, c16_n27_n15, O01A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_p34_p44, c16_n07_p15, c16_n41_n27, c16_n39_n45, O02A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_p23_p43, c16_n34_n07, c16_n36_n45, c16_p19_n11, O03A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_p11_p41, c16_n45_n27, c16_p07_n30, c16_p43_p39, O04A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n02_p39, c16_n36_n41, c16_p43_p07, c16_n11_p34, O05A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n15_p36, c16_n11_n45, c16_p34_p39, c16_n45_n19, O06A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n27_p34, c16_p19_n39, c16_n11_p43, c16_p02_n45, O07A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n36_p30, c16_p41_n23, c16_n44_p15, c16_p45_n07, O08A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n43_p27, c16_p44_n02, c16_n30_n23, c16_p07_p41, O09A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n45_p23, c16_p27_p19, c16_p15_n45, c16_n44_p30, O10A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n44_p19, c16_n02_p36, c16_p45_n34, c16_n15_n23, O11A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n39_p15, c16_n30_p45, c16_p27_p02, c16_p41_n44, O12A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n30_p11, c16_n45_p43, c16_n19_p36, c16_p23_n02, O13A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n19_p07, c16_n39_p30, c16_n45_p44, c16_n36_p43, O14A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n07_p02, c16_n15_p11, c16_n23_p19, c16_n30_p27, O15A) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_p45_p45, c16_p43_p44, 
c16_p39_p41, c16_p34_p36, O00B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_p41_p45, c16_p23_p34, c16_n02_p11, c16_n27_n15, O01B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_p34_p44, c16_n07_p15, c16_n41_n27, c16_n39_n45, O02B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_p23_p43, c16_n34_n07, c16_n36_n45, c16_p19_n11, O03B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_p11_p41, c16_n45_n27, c16_p07_n30, c16_p43_p39, O04B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n02_p39, c16_n36_n41, c16_p43_p07, c16_n11_p34, O05B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n15_p36, c16_n11_n45, c16_p34_p39, c16_n45_n19, O06B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n27_p34, c16_p19_n39, c16_n11_p43, c16_p02_n45, O07B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n36_p30, c16_p41_n23, c16_n44_p15, c16_p45_n07, O08B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n43_p27, c16_p44_n02, c16_n30_n23, c16_p07_p41, O09B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n45_p23, c16_p27_p19, c16_p15_n45, c16_n44_p30, O10B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n44_p19, c16_n02_p36, c16_p45_n34, c16_n15_n23, O11B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n39_p15, c16_n30_p45, c16_p27_p02, c16_p41_n44, O12B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n30_p11, c16_n45_p43, c16_n19_p36, c16_p23_n02, O13B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n19_p07, c16_n39_p30, c16_n45_p44, c16_n36_p43, O14B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n07_p02, c16_n15_p11, c16_n23_p19, c16_n30_p27, O15B) #undef COMPUTE_ROW } EO0A = _mm_add_epi32(_mm_madd_epi16(T_00_08A, c16_p43_p45), _mm_madd_epi16(T_00_09A, c16_p35_p40)); EO1A = _mm_add_epi32(_mm_madd_epi16(T_00_08A, c16_p29_p43), _mm_madd_epi16(T_00_09A, c16_n21_p04)); EO2A = _mm_add_epi32(_mm_madd_epi16(T_00_08A, c16_p04_p40), _mm_madd_epi16(T_00_09A, c16_n43_n35)); EO3A = _mm_add_epi32(_mm_madd_epi16(T_00_08A, c16_n21_p35), _mm_madd_epi16(T_00_09A, c16_p04_n43)); EO4A = _mm_add_epi32(_mm_madd_epi16(T_00_08A, c16_n40_p29), _mm_madd_epi16(T_00_09A, c16_p45_n13)); EO5A = _mm_add_epi32(_mm_madd_epi16(T_00_08A, c16_n45_p21), _mm_madd_epi16(T_00_09A, c16_p13_p29)); EO6A = _mm_add_epi32(_mm_madd_epi16(T_00_08A, c16_n35_p13), _mm_madd_epi16(T_00_09A, c16_n40_p45)); EO7A = _mm_add_epi32(_mm_madd_epi16(T_00_08A, c16_n13_p04), _mm_madd_epi16(T_00_09A, c16_n29_p21)); EO0B = _mm_add_epi32(_mm_madd_epi16(T_00_08B, c16_p43_p45), _mm_madd_epi16(T_00_09B, c16_p35_p40)); EO1B = _mm_add_epi32(_mm_madd_epi16(T_00_08B, c16_p29_p43), _mm_madd_epi16(T_00_09B, c16_n21_p04)); EO2B = _mm_add_epi32(_mm_madd_epi16(T_00_08B, c16_p04_p40), _mm_madd_epi16(T_00_09B, c16_n43_n35)); EO3B = _mm_add_epi32(_mm_madd_epi16(T_00_08B, c16_n21_p35), _mm_madd_epi16(T_00_09B, c16_p04_n43)); EO4B = _mm_add_epi32(_mm_madd_epi16(T_00_08B, c16_n40_p29), _mm_madd_epi16(T_00_09B, c16_p45_n13)); EO5B = _mm_add_epi32(_mm_madd_epi16(T_00_08B, c16_n45_p21), _mm_madd_epi16(T_00_09B, c16_p13_p29)); EO6B = _mm_add_epi32(_mm_madd_epi16(T_00_08B, c16_n35_p13), _mm_madd_epi16(T_00_09B, c16_n40_p45)); EO7B = _mm_add_epi32(_mm_madd_epi16(T_00_08B, c16_n13_p04), _mm_madd_epi16(T_00_09B, c16_n29_p21)); { const __m128i EEO0A = _mm_madd_epi16(T_00_12A, c16_p38_p44); const __m128i EEO1A = _mm_madd_epi16(T_00_12A, c16_n09_p38); const __m128i EEO2A = _mm_madd_epi16(T_00_12A, c16_n44_p25); const __m128i EEO3A = _mm_madd_epi16(T_00_12A, c16_n25_p09); const 
__m128i EEO0B = _mm_madd_epi16(T_00_12B, c16_p38_p44); const __m128i EEO1B = _mm_madd_epi16(T_00_12B, c16_n09_p38); const __m128i EEO2B = _mm_madd_epi16(T_00_12B, c16_n44_p25); const __m128i EEO3B = _mm_madd_epi16(T_00_12B, c16_n25_p09); const __m128i EEEO0A = _mm_madd_epi16(T_00_14A, c16_p17_p42); const __m128i EEEO0B = _mm_madd_epi16(T_00_14B, c16_p17_p42); const __m128i EEEO1A = _mm_madd_epi16(T_00_14A, c16_n42_p17); const __m128i EEEO1B = _mm_madd_epi16(T_00_14B, c16_n42_p17); const __m128i EEEE0A = _mm_madd_epi16(T_00_15A, c16_p32_p32); const __m128i EEEE0B = _mm_madd_epi16(T_00_15B, c16_p32_p32); const __m128i EEEE1A = _mm_madd_epi16(T_00_15A, c16_n32_p32); const __m128i EEEE1B = _mm_madd_epi16(T_00_15B, c16_n32_p32); const __m128i EEE0A = _mm_add_epi32(EEEE0A, EEEO0A); // EEE0 = EEEE0 + EEEO0 const __m128i EEE0B = _mm_add_epi32(EEEE0B, EEEO0B); const __m128i EEE1A = _mm_add_epi32(EEEE1A, EEEO1A); // EEE1 = EEEE1 + EEEO1 const __m128i EEE1B = _mm_add_epi32(EEEE1B, EEEO1B); const __m128i EEE3A = _mm_sub_epi32(EEEE0A, EEEO0A); // EEE2 = EEEE0 - EEEO0 const __m128i EEE3B = _mm_sub_epi32(EEEE0B, EEEO0B); const __m128i EEE2A = _mm_sub_epi32(EEEE1A, EEEO1A); // EEE3 = EEEE1 - EEEO1 const __m128i EEE2B = _mm_sub_epi32(EEEE1B, EEEO1B); const __m128i EE0A = _mm_add_epi32(EEE0A, EEO0A); // EE0 = EEE0 + EEO0 const __m128i EE0B = _mm_add_epi32(EEE0B, EEO0B); const __m128i EE1A = _mm_add_epi32(EEE1A, EEO1A); // EE1 = EEE1 + EEO1 const __m128i EE1B = _mm_add_epi32(EEE1B, EEO1B); const __m128i EE2A = _mm_add_epi32(EEE2A, EEO2A); // EE2 = EEE0 + EEO0 const __m128i EE2B = _mm_add_epi32(EEE2B, EEO2B); const __m128i EE3A = _mm_add_epi32(EEE3A, EEO3A); // EE3 = EEE1 + EEO1 const __m128i EE3B = _mm_add_epi32(EEE3B, EEO3B); const __m128i EE7A = _mm_sub_epi32(EEE0A, EEO0A); // EE7 = EEE0 - EEO0 const __m128i EE7B = _mm_sub_epi32(EEE0B, EEO0B); const __m128i EE6A = _mm_sub_epi32(EEE1A, EEO1A); // EE6 = EEE1 - EEO1 const __m128i EE6B = _mm_sub_epi32(EEE1B, EEO1B); const __m128i EE5A = _mm_sub_epi32(EEE2A, EEO2A); // EE5 = EEE0 - EEO0 const __m128i EE5B = _mm_sub_epi32(EEE2B, EEO2B); const __m128i EE4A = _mm_sub_epi32(EEE3A, EEO3A); // EE4 = EEE1 - EEO1 const __m128i EE4B = _mm_sub_epi32(EEE3B, EEO3B); const __m128i E0A = _mm_add_epi32(EE0A, EO0A); // E0 = EE0 + EO0 const __m128i E0B = _mm_add_epi32(EE0B, EO0B); const __m128i E1A = _mm_add_epi32(EE1A, EO1A); // E1 = EE1 + EO1 const __m128i E1B = _mm_add_epi32(EE1B, EO1B); const __m128i E2A = _mm_add_epi32(EE2A, EO2A); // E2 = EE2 + EO2 const __m128i E2B = _mm_add_epi32(EE2B, EO2B); const __m128i E3A = _mm_add_epi32(EE3A, EO3A); // E3 = EE3 + EO3 const __m128i E3B = _mm_add_epi32(EE3B, EO3B); const __m128i E4A = _mm_add_epi32(EE4A, EO4A); // E4 = const __m128i E4B = _mm_add_epi32(EE4B, EO4B); const __m128i E5A = _mm_add_epi32(EE5A, EO5A); // E5 = const __m128i E5B = _mm_add_epi32(EE5B, EO5B); const __m128i E6A = _mm_add_epi32(EE6A, EO6A); // E6 = const __m128i E6B = _mm_add_epi32(EE6B, EO6B); const __m128i E7A = _mm_add_epi32(EE7A, EO7A); // E7 = const __m128i E7B = _mm_add_epi32(EE7B, EO7B); const __m128i EFA = _mm_sub_epi32(EE0A, EO0A); // EF = EE0 - EO0 const __m128i EFB = _mm_sub_epi32(EE0B, EO0B); const __m128i EEA = _mm_sub_epi32(EE1A, EO1A); // EE = EE1 - EO1 const __m128i EEB = _mm_sub_epi32(EE1B, EO1B); const __m128i EDA = _mm_sub_epi32(EE2A, EO2A); // ED = EE2 - EO2 const __m128i EDB = _mm_sub_epi32(EE2B, EO2B); const __m128i ECA = _mm_sub_epi32(EE3A, EO3A); // EC = EE3 - EO3 const __m128i ECB = _mm_sub_epi32(EE3B, EO3B); const __m128i EBA = 
_mm_sub_epi32(EE4A, EO4A); // EB = const __m128i EBB = _mm_sub_epi32(EE4B, EO4B); const __m128i EAA = _mm_sub_epi32(EE5A, EO5A); // EA = const __m128i EAB = _mm_sub_epi32(EE5B, EO5B); const __m128i E9A = _mm_sub_epi32(EE6A, EO6A); // E9 = const __m128i E9B = _mm_sub_epi32(EE6B, EO6B); const __m128i E8A = _mm_sub_epi32(EE7A, EO7A); // E8 = const __m128i E8B = _mm_sub_epi32(EE7B, EO7B); const __m128i T10A = _mm_add_epi32(E0A, c32_rnd); // E0 + rnd const __m128i T10B = _mm_add_epi32(E0B, c32_rnd); const __m128i T11A = _mm_add_epi32(E1A, c32_rnd); // E1 + rnd const __m128i T11B = _mm_add_epi32(E1B, c32_rnd); const __m128i T12A = _mm_add_epi32(E2A, c32_rnd); // E2 + rnd const __m128i T12B = _mm_add_epi32(E2B, c32_rnd); const __m128i T13A = _mm_add_epi32(E3A, c32_rnd); // E3 + rnd const __m128i T13B = _mm_add_epi32(E3B, c32_rnd); const __m128i T14A = _mm_add_epi32(E4A, c32_rnd); // E4 + rnd const __m128i T14B = _mm_add_epi32(E4B, c32_rnd); const __m128i T15A = _mm_add_epi32(E5A, c32_rnd); // E5 + rnd const __m128i T15B = _mm_add_epi32(E5B, c32_rnd); const __m128i T16A = _mm_add_epi32(E6A, c32_rnd); // E6 + rnd const __m128i T16B = _mm_add_epi32(E6B, c32_rnd); const __m128i T17A = _mm_add_epi32(E7A, c32_rnd); // E7 + rnd const __m128i T17B = _mm_add_epi32(E7B, c32_rnd); const __m128i T18A = _mm_add_epi32(E8A, c32_rnd); // E8 + rnd const __m128i T18B = _mm_add_epi32(E8B, c32_rnd); const __m128i T19A = _mm_add_epi32(E9A, c32_rnd); // E9 + rnd const __m128i T19B = _mm_add_epi32(E9B, c32_rnd); const __m128i T1AA = _mm_add_epi32(EAA, c32_rnd); // E10 + rnd const __m128i T1AB = _mm_add_epi32(EAB, c32_rnd); const __m128i T1BA = _mm_add_epi32(EBA, c32_rnd); // E11 + rnd const __m128i T1BB = _mm_add_epi32(EBB, c32_rnd); const __m128i T1CA = _mm_add_epi32(ECA, c32_rnd); // E12 + rnd const __m128i T1CB = _mm_add_epi32(ECB, c32_rnd); const __m128i T1DA = _mm_add_epi32(EDA, c32_rnd); // E13 + rnd const __m128i T1DB = _mm_add_epi32(EDB, c32_rnd); const __m128i T1EA = _mm_add_epi32(EEA, c32_rnd); // E14 + rnd const __m128i T1EB = _mm_add_epi32(EEB, c32_rnd); const __m128i T1FA = _mm_add_epi32(EFA, c32_rnd); // E15 + rnd const __m128i T1FB = _mm_add_epi32(EFB, c32_rnd); const __m128i T2_00A = _mm_add_epi32(T10A, O00A); // E0 + O0 + rnd const __m128i T2_00B = _mm_add_epi32(T10B, O00B); const __m128i T2_01A = _mm_add_epi32(T11A, O01A); // E1 + O1 + rnd const __m128i T2_01B = _mm_add_epi32(T11B, O01B); const __m128i T2_02A = _mm_add_epi32(T12A, O02A); // E2 + O2 + rnd const __m128i T2_02B = _mm_add_epi32(T12B, O02B); const __m128i T2_03A = _mm_add_epi32(T13A, O03A); // E3 + O3 + rnd const __m128i T2_03B = _mm_add_epi32(T13B, O03B); const __m128i T2_04A = _mm_add_epi32(T14A, O04A); // E4 const __m128i T2_04B = _mm_add_epi32(T14B, O04B); const __m128i T2_05A = _mm_add_epi32(T15A, O05A); // E5 const __m128i T2_05B = _mm_add_epi32(T15B, O05B); const __m128i T2_06A = _mm_add_epi32(T16A, O06A); // E6 const __m128i T2_06B = _mm_add_epi32(T16B, O06B); const __m128i T2_07A = _mm_add_epi32(T17A, O07A); // E7 const __m128i T2_07B = _mm_add_epi32(T17B, O07B); const __m128i T2_08A = _mm_add_epi32(T18A, O08A); // E8 const __m128i T2_08B = _mm_add_epi32(T18B, O08B); const __m128i T2_09A = _mm_add_epi32(T19A, O09A); // E9 const __m128i T2_09B = _mm_add_epi32(T19B, O09B); const __m128i T2_10A = _mm_add_epi32(T1AA, O10A); // E10 const __m128i T2_10B = _mm_add_epi32(T1AB, O10B); const __m128i T2_11A = _mm_add_epi32(T1BA, O11A); // E11 const __m128i T2_11B = _mm_add_epi32(T1BB, O11B); const __m128i T2_12A = _mm_add_epi32(T1CA, O12A); 
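/*
 * Rounding note: c32_rnd was folded into the even part once (T1x = E[x] + rnd
 * above), so both butterfly outputs share the same offset before the
 * arithmetic right shift.  As a scalar sketch, each output row k of this
 * pass is
 *     out[k]      = (E[k] + O[k] + (1 << (nShift - 1))) >> nShift;
 *     out[31 - k] = (E[k] - O[k] + (1 << (nShift - 1))) >> nShift;
 * with nShift = 5 in pass 1 and nShift = shift2 in pass 2.
 */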
// E12 const __m128i T2_12B = _mm_add_epi32(T1CB, O12B); const __m128i T2_13A = _mm_add_epi32(T1DA, O13A); // E13 const __m128i T2_13B = _mm_add_epi32(T1DB, O13B); const __m128i T2_14A = _mm_add_epi32(T1EA, O14A); // E14 const __m128i T2_14B = _mm_add_epi32(T1EB, O14B); const __m128i T2_15A = _mm_add_epi32(T1FA, O15A); // E15 const __m128i T2_15B = _mm_add_epi32(T1FB, O15B); const __m128i T2_31A = _mm_sub_epi32(T10A, O00A); // E0 - O0 + rnd const __m128i T2_31B = _mm_sub_epi32(T10B, O00B); const __m128i T2_30A = _mm_sub_epi32(T11A, O01A); // E1 - O1 + rnd const __m128i T2_30B = _mm_sub_epi32(T11B, O01B); const __m128i T2_29A = _mm_sub_epi32(T12A, O02A); // E2 - O2 + rnd const __m128i T2_29B = _mm_sub_epi32(T12B, O02B); const __m128i T2_28A = _mm_sub_epi32(T13A, O03A); // E3 - O3 + rnd const __m128i T2_28B = _mm_sub_epi32(T13B, O03B); const __m128i T2_27A = _mm_sub_epi32(T14A, O04A); // E4 const __m128i T2_27B = _mm_sub_epi32(T14B, O04B); const __m128i T2_26A = _mm_sub_epi32(T15A, O05A); // E5 const __m128i T2_26B = _mm_sub_epi32(T15B, O05B); const __m128i T2_25A = _mm_sub_epi32(T16A, O06A); // E6 const __m128i T2_25B = _mm_sub_epi32(T16B, O06B); const __m128i T2_24A = _mm_sub_epi32(T17A, O07A); // E7 const __m128i T2_24B = _mm_sub_epi32(T17B, O07B); const __m128i T2_23A = _mm_sub_epi32(T18A, O08A); // const __m128i T2_23B = _mm_sub_epi32(T18B, O08B); const __m128i T2_22A = _mm_sub_epi32(T19A, O09A); // const __m128i T2_22B = _mm_sub_epi32(T19B, O09B); const __m128i T2_21A = _mm_sub_epi32(T1AA, O10A); // const __m128i T2_21B = _mm_sub_epi32(T1AB, O10B); const __m128i T2_20A = _mm_sub_epi32(T1BA, O11A); // const __m128i T2_20B = _mm_sub_epi32(T1BB, O11B); const __m128i T2_19A = _mm_sub_epi32(T1CA, O12A); // const __m128i T2_19B = _mm_sub_epi32(T1CB, O12B); const __m128i T2_18A = _mm_sub_epi32(T1DA, O13A); // const __m128i T2_18B = _mm_sub_epi32(T1DB, O13B); const __m128i T2_17A = _mm_sub_epi32(T1EA, O14A); // const __m128i T2_17B = _mm_sub_epi32(T1EB, O14B); const __m128i T2_16A = _mm_sub_epi32(T1FA, O15A); // const __m128i T2_16B = _mm_sub_epi32(T1FB, O15B); const __m128i T3_00A = _mm_srai_epi32(T2_00A, nShift); // [30 20 10 00] const __m128i T3_00B = _mm_srai_epi32(T2_00B, nShift); // [70 60 50 40] const __m128i T3_01A = _mm_srai_epi32(T2_01A, nShift); // [31 21 11 01] const __m128i T3_01B = _mm_srai_epi32(T2_01B, nShift); // [71 61 51 41] const __m128i T3_02A = _mm_srai_epi32(T2_02A, nShift); // [32 22 12 02] const __m128i T3_02B = _mm_srai_epi32(T2_02B, nShift); // [72 62 52 42] const __m128i T3_03A = _mm_srai_epi32(T2_03A, nShift); // [33 23 13 03] const __m128i T3_03B = _mm_srai_epi32(T2_03B, nShift); // [73 63 53 43] const __m128i T3_04A = _mm_srai_epi32(T2_04A, nShift); // [33 24 14 04] const __m128i T3_04B = _mm_srai_epi32(T2_04B, nShift); // [74 64 54 44] const __m128i T3_05A = _mm_srai_epi32(T2_05A, nShift); // [35 25 15 05] const __m128i T3_05B = _mm_srai_epi32(T2_05B, nShift); // [75 65 55 45] const __m128i T3_06A = _mm_srai_epi32(T2_06A, nShift); // [36 26 16 06] const __m128i T3_06B = _mm_srai_epi32(T2_06B, nShift); // [76 66 56 46] const __m128i T3_07A = _mm_srai_epi32(T2_07A, nShift); // [37 27 17 07] const __m128i T3_07B = _mm_srai_epi32(T2_07B, nShift); // [77 67 57 47] const __m128i T3_08A = _mm_srai_epi32(T2_08A, nShift); // [30 20 10 00] x8 const __m128i T3_08B = _mm_srai_epi32(T2_08B, nShift); // [70 60 50 40] const __m128i T3_09A = _mm_srai_epi32(T2_09A, nShift); // [31 21 11 01] x9 const __m128i T3_09B = _mm_srai_epi32(T2_09B, nShift); // [71 61 51 41] const __m128i 
T3_10A = _mm_srai_epi32(T2_10A, nShift); // [32 22 12 02] xA const __m128i T3_10B = _mm_srai_epi32(T2_10B, nShift); // [72 62 52 42] const __m128i T3_11A = _mm_srai_epi32(T2_11A, nShift); // [33 23 13 03] xB const __m128i T3_11B = _mm_srai_epi32(T2_11B, nShift); // [73 63 53 43] const __m128i T3_12A = _mm_srai_epi32(T2_12A, nShift); // [33 24 14 04] xC const __m128i T3_12B = _mm_srai_epi32(T2_12B, nShift); // [74 64 54 44] const __m128i T3_13A = _mm_srai_epi32(T2_13A, nShift); // [35 25 15 05] xD const __m128i T3_13B = _mm_srai_epi32(T2_13B, nShift); // [75 65 55 45] const __m128i T3_14A = _mm_srai_epi32(T2_14A, nShift); // [36 26 16 06] xE const __m128i T3_14B = _mm_srai_epi32(T2_14B, nShift); // [76 66 56 46] const __m128i T3_15A = _mm_srai_epi32(T2_15A, nShift); // [37 27 17 07] xF const __m128i T3_15B = _mm_srai_epi32(T2_15B, nShift); // [77 67 57 47] const __m128i T3_16A = _mm_srai_epi32(T2_16A, nShift); // [30 20 10 00] const __m128i T3_16B = _mm_srai_epi32(T2_16B, nShift); // [70 60 50 40] const __m128i T3_17A = _mm_srai_epi32(T2_17A, nShift); // [31 21 11 01] const __m128i T3_17B = _mm_srai_epi32(T2_17B, nShift); // [71 61 51 41] const __m128i T3_18A = _mm_srai_epi32(T2_18A, nShift); // [32 22 12 02] const __m128i T3_18B = _mm_srai_epi32(T2_18B, nShift); // [72 62 52 42] const __m128i T3_19A = _mm_srai_epi32(T2_19A, nShift); // [33 23 13 03] const __m128i T3_19B = _mm_srai_epi32(T2_19B, nShift); // [73 63 53 43] const __m128i T3_20A = _mm_srai_epi32(T2_20A, nShift); // [33 24 14 04] const __m128i T3_20B = _mm_srai_epi32(T2_20B, nShift); // [74 64 54 44] const __m128i T3_21A = _mm_srai_epi32(T2_21A, nShift); // [35 25 15 05] const __m128i T3_21B = _mm_srai_epi32(T2_21B, nShift); // [75 65 55 45] const __m128i T3_22A = _mm_srai_epi32(T2_22A, nShift); // [36 26 16 06] const __m128i T3_22B = _mm_srai_epi32(T2_22B, nShift); // [76 66 56 46] const __m128i T3_23A = _mm_srai_epi32(T2_23A, nShift); // [37 27 17 07] const __m128i T3_23B = _mm_srai_epi32(T2_23B, nShift); // [77 67 57 47] const __m128i T3_24A = _mm_srai_epi32(T2_24A, nShift); // [30 20 10 00] x8 const __m128i T3_24B = _mm_srai_epi32(T2_24B, nShift); // [70 60 50 40] const __m128i T3_25A = _mm_srai_epi32(T2_25A, nShift); // [31 21 11 01] x9 const __m128i T3_25B = _mm_srai_epi32(T2_25B, nShift); // [71 61 51 41] const __m128i T3_26A = _mm_srai_epi32(T2_26A, nShift); // [32 22 12 02] xA const __m128i T3_26B = _mm_srai_epi32(T2_26B, nShift); // [72 62 52 42] const __m128i T3_27A = _mm_srai_epi32(T2_27A, nShift); // [33 23 13 03] xB const __m128i T3_27B = _mm_srai_epi32(T2_27B, nShift); // [73 63 53 43] const __m128i T3_28A = _mm_srai_epi32(T2_28A, nShift); // [33 24 14 04] xC const __m128i T3_28B = _mm_srai_epi32(T2_28B, nShift); // [74 64 54 44] const __m128i T3_29A = _mm_srai_epi32(T2_29A, nShift); // [35 25 15 05] xD const __m128i T3_29B = _mm_srai_epi32(T2_29B, nShift); // [75 65 55 45] const __m128i T3_30A = _mm_srai_epi32(T2_30A, nShift); // [36 26 16 06] xE const __m128i T3_30B = _mm_srai_epi32(T2_30B, nShift); // [76 66 56 46] const __m128i T3_31A = _mm_srai_epi32(T2_31A, nShift); // [37 27 17 07] xF const __m128i T3_31B = _mm_srai_epi32(T2_31B, nShift); // [77 67 57 47] res00[part] = _mm_packs_epi32(T3_00A, T3_00B); // [70 60 50 40 30 20 10 00] res01[part] = _mm_packs_epi32(T3_01A, T3_01B); // [71 61 51 41 31 21 11 01] res02[part] = _mm_packs_epi32(T3_02A, T3_02B); // [72 62 52 42 32 22 12 02] res03[part] = _mm_packs_epi32(T3_03A, T3_03B); // [73 63 53 43 33 23 13 03] res04[part] = _mm_packs_epi32(T3_04A, T3_04B); // [74 
64 54 44 34 24 14 04] res05[part] = _mm_packs_epi32(T3_05A, T3_05B); // [75 65 55 45 35 25 15 05] res06[part] = _mm_packs_epi32(T3_06A, T3_06B); // [76 66 56 46 36 26 16 06] res07[part] = _mm_packs_epi32(T3_07A, T3_07B); // [77 67 57 47 37 27 17 07] res08[part] = _mm_packs_epi32(T3_08A, T3_08B); // [A0 ... 80] res09[part] = _mm_packs_epi32(T3_09A, T3_09B); // [A1 ... 81] res10[part] = _mm_packs_epi32(T3_10A, T3_10B); // [A2 ... 82] res11[part] = _mm_packs_epi32(T3_11A, T3_11B); // [A3 ... 83] res12[part] = _mm_packs_epi32(T3_12A, T3_12B); // [A4 ... 84] res13[part] = _mm_packs_epi32(T3_13A, T3_13B); // [A5 ... 85] res14[part] = _mm_packs_epi32(T3_14A, T3_14B); // [A6 ... 86] res15[part] = _mm_packs_epi32(T3_15A, T3_15B); // [A7 ... 87] res16[part] = _mm_packs_epi32(T3_16A, T3_16B); res17[part] = _mm_packs_epi32(T3_17A, T3_17B); res18[part] = _mm_packs_epi32(T3_18A, T3_18B); res19[part] = _mm_packs_epi32(T3_19A, T3_19B); res20[part] = _mm_packs_epi32(T3_20A, T3_20B); res21[part] = _mm_packs_epi32(T3_21A, T3_21B); res22[part] = _mm_packs_epi32(T3_22A, T3_22B); res23[part] = _mm_packs_epi32(T3_23A, T3_23B); res24[part] = _mm_packs_epi32(T3_24A, T3_24B); res25[part] = _mm_packs_epi32(T3_25A, T3_25B); res26[part] = _mm_packs_epi32(T3_26A, T3_26B); res27[part] = _mm_packs_epi32(T3_27A, T3_27B); res28[part] = _mm_packs_epi32(T3_28A, T3_28B); res29[part] = _mm_packs_epi32(T3_29A, T3_29B); res30[part] = _mm_packs_epi32(T3_30A, T3_30B); res31[part] = _mm_packs_epi32(T3_31A, T3_31B); } } //transpose matrix 8x8 16bit. { __m128i tr0_0, tr0_1, tr0_2, tr0_3, tr0_4, tr0_5, tr0_6, tr0_7; __m128i tr1_0, tr1_1, tr1_2, tr1_3, tr1_4, tr1_5, tr1_6, tr1_7; #define TRANSPOSE_8x8_16BIT(I0, I1, I2, I3, I4, I5, I6, I7, O0, O1, O2, O3, O4, O5, O6, O7) \ tr0_0 = _mm_unpacklo_epi16(I0, I1); \ tr0_1 = _mm_unpacklo_epi16(I2, I3); \ tr0_2 = _mm_unpackhi_epi16(I0, I1); \ tr0_3 = _mm_unpackhi_epi16(I2, I3); \ tr0_4 = _mm_unpacklo_epi16(I4, I5); \ tr0_5 = _mm_unpacklo_epi16(I6, I7); \ tr0_6 = _mm_unpackhi_epi16(I4, I5); \ tr0_7 = _mm_unpackhi_epi16(I6, I7); \ tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \ tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \ tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \ tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \ O0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ O1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ O2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ O3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ O4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \ O5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \ O6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \ O7 = _mm_unpackhi_epi64(tr1_3, tr1_7); TRANSPOSE_8x8_16BIT(res00[0], res01[0], res02[0], res03[0], res04[0], res05[0], res06[0], res07[0], in00[0], in01[0], in02[0], in03[0], in04[0], in05[0], in06[0], in07[0]) TRANSPOSE_8x8_16BIT(res00[1], res01[1], res02[1], res03[1], res04[1], res05[1], res06[1], res07[1], in08[0], in09[0], in10[0], in11[0], in12[0], in13[0], in14[0], in15[0]) TRANSPOSE_8x8_16BIT(res08[0], res09[0], res10[0], res11[0], res12[0], res13[0], res14[0], res15[0], in00[1], in01[1], in02[1], in03[1], in04[1], in05[1], in06[1], in07[1]) TRANSPOSE_8x8_16BIT(res08[1], res09[1], res10[1], res11[1], res12[1], res13[1], res14[1], res15[1], in08[1], in09[1], in10[1], in11[1], in12[1], in13[1], in14[1], in15[1]) TRANSPOSE_8x8_16BIT(res16[0], res17[0], res18[0], res19[0], res20[0], res21[0], res22[0], res23[0], in00[2], in01[2], in02[2], 
in03[2], in04[2], in05[2], in06[2], in07[2]) TRANSPOSE_8x8_16BIT(res16[1], res17[1], res18[1], res19[1], res20[1], res21[1], res22[1], res23[1], in08[2], in09[2], in10[2], in11[2], in12[2], in13[2], in14[2], in15[2]) TRANSPOSE_8x8_16BIT(res24[0], res25[0], res26[0], res27[0], res28[0], res29[0], res30[0], res31[0], in00[3], in01[3], in02[3], in03[3], in04[3], in05[3], in06[3], in07[3]) TRANSPOSE_8x8_16BIT(res24[1], res25[1], res26[1], res27[1], res28[1], res29[1], res30[1], res31[1], in08[3], in09[3], in10[3], in11[3], in12[3], in13[3], in14[3], in15[3]) #undef TRANSPOSE_8x8_16BIT } //pass=2 c32_rnd = _mm_set1_epi32(1 << (shift2 - 1)); // add2 nShift = shift2; for (part = 0; part < 4; part++) { const __m128i T_00_00A = _mm_unpacklo_epi16(in01[part], in03[part]); // [33 13 32 12 31 11 30 10] const __m128i T_00_00B = _mm_unpackhi_epi16(in01[part], in03[part]); // [37 17 36 16 35 15 34 14] const __m128i T_00_01A = _mm_unpacklo_epi16(in05[part], in07[part]); // [ ] const __m128i T_00_01B = _mm_unpackhi_epi16(in05[part], in07[part]); // [ ] const __m128i T_00_02A = _mm_unpacklo_epi16(in09[part], in11[part]); // [ ] const __m128i T_00_02B = _mm_unpackhi_epi16(in09[part], in11[part]); // [ ] const __m128i T_00_03A = _mm_unpacklo_epi16(in13[part], in15[part]); // [ ] const __m128i T_00_03B = _mm_unpackhi_epi16(in13[part], in15[part]); // [ ] const __m128i T_00_08A = _mm_unpacklo_epi16(in02[part], in06[part]); // [ ] const __m128i T_00_08B = _mm_unpackhi_epi16(in02[part], in06[part]); // [ ] const __m128i T_00_09A = _mm_unpacklo_epi16(in10[part], in14[part]); // [ ] const __m128i T_00_09B = _mm_unpackhi_epi16(in10[part], in14[part]); // [ ] const __m128i T_00_12A = _mm_unpacklo_epi16(in04[part], in12[part]); // [ ] const __m128i T_00_12B = _mm_unpackhi_epi16(in04[part], in12[part]); // [ ] const __m128i T_00_14A = _mm_unpacklo_epi16(in08[part], Zero_16); // const __m128i T_00_14B = _mm_unpackhi_epi16(in08[part], Zero_16); // [ ] const __m128i T_00_15A = _mm_unpacklo_epi16(in00[part], Zero_16); // const __m128i T_00_15B = _mm_unpackhi_epi16(in00[part], Zero_16); // [ ] __m128i O00A, O01A, O02A, O03A, O04A, O05A, O06A, O07A, O08A, O09A, O10A, O11A, O12A, O13A, O14A, O15A; __m128i O00B, O01B, O02B, O03B, O04B, O05B, O06B, O07B, O08B, O09B, O10B, O11B, O12B, O13B, O14B, O15B; __m128i EO0A, EO1A, EO2A, EO3A, EO4A, EO5A, EO6A, EO7A; __m128i EO0B, EO1B, EO2B, EO3B, EO4B, EO5B, EO6B, EO7B; { __m128i T00, T01; #define COMPUTE_ROW(r0103, r0507, r0911, r1315, c0103, c0507, c0911, c1315, row) \ T00 = _mm_add_epi32(_mm_madd_epi16(r0103, c0103), _mm_madd_epi16(r0507, c0507)); \ T01 = _mm_add_epi32(_mm_madd_epi16(r0911, c0911), _mm_madd_epi16(r1315, c1315)); \ row = _mm_add_epi32(T00, T01); COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_p45_p45, c16_p43_p44, c16_p39_p41, c16_p34_p36, O00A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_p41_p45, c16_p23_p34, c16_n02_p11, c16_n27_n15, O01A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_p34_p44, c16_n07_p15, c16_n41_n27, c16_n39_n45, O02A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_p23_p43, c16_n34_n07, c16_n36_n45, c16_p19_n11, O03A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_p11_p41, c16_n45_n27, c16_p07_n30, c16_p43_p39, O04A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n02_p39, c16_n36_n41, c16_p43_p07, c16_n11_p34, O05A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n15_p36, c16_n11_n45, c16_p34_p39, c16_n45_n19, O06A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n27_p34, 
c16_p19_n39, c16_n11_p43, c16_p02_n45, O07A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n36_p30, c16_p41_n23, c16_n44_p15, c16_p45_n07, O08A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n43_p27, c16_p44_n02, c16_n30_n23, c16_p07_p41, O09A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n45_p23, c16_p27_p19, c16_p15_n45, c16_n44_p30, O10A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n44_p19, c16_n02_p36, c16_p45_n34, c16_n15_n23, O11A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n39_p15, c16_n30_p45, c16_p27_p02, c16_p41_n44, O12A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n30_p11, c16_n45_p43, c16_n19_p36, c16_p23_n02, O13A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n19_p07, c16_n39_p30, c16_n45_p44, c16_n36_p43, O14A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n07_p02, c16_n15_p11, c16_n23_p19, c16_n30_p27, O15A) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_p45_p45, c16_p43_p44, c16_p39_p41, c16_p34_p36, O00B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_p41_p45, c16_p23_p34, c16_n02_p11, c16_n27_n15, O01B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_p34_p44, c16_n07_p15, c16_n41_n27, c16_n39_n45, O02B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_p23_p43, c16_n34_n07, c16_n36_n45, c16_p19_n11, O03B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_p11_p41, c16_n45_n27, c16_p07_n30, c16_p43_p39, O04B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n02_p39, c16_n36_n41, c16_p43_p07, c16_n11_p34, O05B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n15_p36, c16_n11_n45, c16_p34_p39, c16_n45_n19, O06B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n27_p34, c16_p19_n39, c16_n11_p43, c16_p02_n45, O07B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n36_p30, c16_p41_n23, c16_n44_p15, c16_p45_n07, O08B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n43_p27, c16_p44_n02, c16_n30_n23, c16_p07_p41, O09B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n45_p23, c16_p27_p19, c16_p15_n45, c16_n44_p30, O10B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n44_p19, c16_n02_p36, c16_p45_n34, c16_n15_n23, O11B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n39_p15, c16_n30_p45, c16_p27_p02, c16_p41_n44, O12B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n30_p11, c16_n45_p43, c16_n19_p36, c16_p23_n02, O13B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n19_p07, c16_n39_p30, c16_n45_p44, c16_n36_p43, O14B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n07_p02, c16_n15_p11, c16_n23_p19, c16_n30_p27, O15B) #undef COMPUTE_ROW } EO0A = _mm_add_epi32(_mm_madd_epi16(T_00_08A, c16_p43_p45), _mm_madd_epi16(T_00_09A, c16_p35_p40)); EO1A = _mm_add_epi32(_mm_madd_epi16(T_00_08A, c16_p29_p43), _mm_madd_epi16(T_00_09A, c16_n21_p04)); EO2A = _mm_add_epi32(_mm_madd_epi16(T_00_08A, c16_p04_p40), _mm_madd_epi16(T_00_09A, c16_n43_n35)); EO3A = _mm_add_epi32(_mm_madd_epi16(T_00_08A, c16_n21_p35), _mm_madd_epi16(T_00_09A, c16_p04_n43)); EO4A = _mm_add_epi32(_mm_madd_epi16(T_00_08A, c16_n40_p29), _mm_madd_epi16(T_00_09A, c16_p45_n13)); EO5A = _mm_add_epi32(_mm_madd_epi16(T_00_08A, c16_n45_p21), _mm_madd_epi16(T_00_09A, c16_p13_p29)); EO6A = _mm_add_epi32(_mm_madd_epi16(T_00_08A, c16_n35_p13), _mm_madd_epi16(T_00_09A, c16_n40_p45)); EO7A = _mm_add_epi32(_mm_madd_epi16(T_00_08A, c16_n13_p04), _mm_madd_epi16(T_00_09A, c16_n29_p21)); EO0B = _mm_add_epi32(_mm_madd_epi16(T_00_08B, 
c16_p43_p45), _mm_madd_epi16(T_00_09B, c16_p35_p40)); EO1B = _mm_add_epi32(_mm_madd_epi16(T_00_08B, c16_p29_p43), _mm_madd_epi16(T_00_09B, c16_n21_p04)); EO2B = _mm_add_epi32(_mm_madd_epi16(T_00_08B, c16_p04_p40), _mm_madd_epi16(T_00_09B, c16_n43_n35)); EO3B = _mm_add_epi32(_mm_madd_epi16(T_00_08B, c16_n21_p35), _mm_madd_epi16(T_00_09B, c16_p04_n43)); EO4B = _mm_add_epi32(_mm_madd_epi16(T_00_08B, c16_n40_p29), _mm_madd_epi16(T_00_09B, c16_p45_n13)); EO5B = _mm_add_epi32(_mm_madd_epi16(T_00_08B, c16_n45_p21), _mm_madd_epi16(T_00_09B, c16_p13_p29)); EO6B = _mm_add_epi32(_mm_madd_epi16(T_00_08B, c16_n35_p13), _mm_madd_epi16(T_00_09B, c16_n40_p45)); EO7B = _mm_add_epi32(_mm_madd_epi16(T_00_08B, c16_n13_p04), _mm_madd_epi16(T_00_09B, c16_n29_p21)); { const __m128i EEO0A = _mm_madd_epi16(T_00_12A, c16_p38_p44); const __m128i EEO1A = _mm_madd_epi16(T_00_12A, c16_n09_p38); const __m128i EEO2A = _mm_madd_epi16(T_00_12A, c16_n44_p25); const __m128i EEO3A = _mm_madd_epi16(T_00_12A, c16_n25_p09); const __m128i EEO0B = _mm_madd_epi16(T_00_12B, c16_p38_p44); const __m128i EEO1B = _mm_madd_epi16(T_00_12B, c16_n09_p38); const __m128i EEO2B = _mm_madd_epi16(T_00_12B, c16_n44_p25); const __m128i EEO3B = _mm_madd_epi16(T_00_12B, c16_n25_p09); const __m128i EEEO0A = _mm_madd_epi16(T_00_14A, c16_p17_p42); const __m128i EEEO0B = _mm_madd_epi16(T_00_14B, c16_p17_p42); const __m128i EEEO1A = _mm_madd_epi16(T_00_14A, c16_n42_p17); const __m128i EEEO1B = _mm_madd_epi16(T_00_14B, c16_n42_p17); const __m128i EEEE0A = _mm_madd_epi16(T_00_15A, c16_p32_p32); const __m128i EEEE0B = _mm_madd_epi16(T_00_15B, c16_p32_p32); const __m128i EEEE1A = _mm_madd_epi16(T_00_15A, c16_n32_p32); const __m128i EEEE1B = _mm_madd_epi16(T_00_15B, c16_n32_p32); const __m128i EEE0A = _mm_add_epi32(EEEE0A, EEEO0A); // EEE0 = EEEE0 + EEEO0 const __m128i EEE0B = _mm_add_epi32(EEEE0B, EEEO0B); const __m128i EEE1A = _mm_add_epi32(EEEE1A, EEEO1A); // EEE1 = EEEE1 + EEEO1 const __m128i EEE1B = _mm_add_epi32(EEEE1B, EEEO1B); const __m128i EEE3A = _mm_sub_epi32(EEEE0A, EEEO0A); // EEE2 = EEEE0 - EEEO0 const __m128i EEE3B = _mm_sub_epi32(EEEE0B, EEEO0B); const __m128i EEE2A = _mm_sub_epi32(EEEE1A, EEEO1A); // EEE3 = EEEE1 - EEEO1 const __m128i EEE2B = _mm_sub_epi32(EEEE1B, EEEO1B); const __m128i EE0A = _mm_add_epi32(EEE0A, EEO0A); // EE0 = EEE0 + EEO0 const __m128i EE0B = _mm_add_epi32(EEE0B, EEO0B); const __m128i EE1A = _mm_add_epi32(EEE1A, EEO1A); // EE1 = EEE1 + EEO1 const __m128i EE1B = _mm_add_epi32(EEE1B, EEO1B); const __m128i EE2A = _mm_add_epi32(EEE2A, EEO2A); // EE2 = EEE0 + EEO0 const __m128i EE2B = _mm_add_epi32(EEE2B, EEO2B); const __m128i EE3A = _mm_add_epi32(EEE3A, EEO3A); // EE3 = EEE1 + EEO1 const __m128i EE3B = _mm_add_epi32(EEE3B, EEO3B); const __m128i EE7A = _mm_sub_epi32(EEE0A, EEO0A); // EE7 = EEE0 - EEO0 const __m128i EE7B = _mm_sub_epi32(EEE0B, EEO0B); const __m128i EE6A = _mm_sub_epi32(EEE1A, EEO1A); // EE6 = EEE1 - EEO1 const __m128i EE6B = _mm_sub_epi32(EEE1B, EEO1B); const __m128i EE5A = _mm_sub_epi32(EEE2A, EEO2A); // EE5 = EEE0 - EEO0 const __m128i EE5B = _mm_sub_epi32(EEE2B, EEO2B); const __m128i EE4A = _mm_sub_epi32(EEE3A, EEO3A); // EE4 = EEE1 - EEO1 const __m128i EE4B = _mm_sub_epi32(EEE3B, EEO3B); const __m128i E0A = _mm_add_epi32(EE0A, EO0A); // E0 = EE0 + EO0 const __m128i E0B = _mm_add_epi32(EE0B, EO0B); const __m128i E1A = _mm_add_epi32(EE1A, EO1A); // E1 = EE1 + EO1 const __m128i E1B = _mm_add_epi32(EE1B, EO1B); const __m128i E2A = _mm_add_epi32(EE2A, EO2A); // E2 = EE2 + EO2 const __m128i E2B = 
_mm_add_epi32(EE2B, EO2B); const __m128i E3A = _mm_add_epi32(EE3A, EO3A); // E3 = EE3 + EO3 const __m128i E3B = _mm_add_epi32(EE3B, EO3B); const __m128i E4A = _mm_add_epi32(EE4A, EO4A); // E4 = const __m128i E4B = _mm_add_epi32(EE4B, EO4B); const __m128i E5A = _mm_add_epi32(EE5A, EO5A); // E5 = const __m128i E5B = _mm_add_epi32(EE5B, EO5B); const __m128i E6A = _mm_add_epi32(EE6A, EO6A); // E6 = const __m128i E6B = _mm_add_epi32(EE6B, EO6B); const __m128i E7A = _mm_add_epi32(EE7A, EO7A); // E7 = const __m128i E7B = _mm_add_epi32(EE7B, EO7B); const __m128i EFA = _mm_sub_epi32(EE0A, EO0A); // EF = EE0 - EO0 const __m128i EFB = _mm_sub_epi32(EE0B, EO0B); const __m128i EEA = _mm_sub_epi32(EE1A, EO1A); // EE = EE1 - EO1 const __m128i EEB = _mm_sub_epi32(EE1B, EO1B); const __m128i EDA = _mm_sub_epi32(EE2A, EO2A); // ED = EE2 - EO2 const __m128i EDB = _mm_sub_epi32(EE2B, EO2B); const __m128i ECA = _mm_sub_epi32(EE3A, EO3A); // EC = EE3 - EO3 const __m128i ECB = _mm_sub_epi32(EE3B, EO3B); const __m128i EBA = _mm_sub_epi32(EE4A, EO4A); // EB = const __m128i EBB = _mm_sub_epi32(EE4B, EO4B); const __m128i EAA = _mm_sub_epi32(EE5A, EO5A); // EA = const __m128i EAB = _mm_sub_epi32(EE5B, EO5B); const __m128i E9A = _mm_sub_epi32(EE6A, EO6A); // E9 = const __m128i E9B = _mm_sub_epi32(EE6B, EO6B); const __m128i E8A = _mm_sub_epi32(EE7A, EO7A); // E8 = const __m128i E8B = _mm_sub_epi32(EE7B, EO7B); const __m128i T10A = _mm_add_epi32(E0A, c32_rnd); // E0 + rnd const __m128i T10B = _mm_add_epi32(E0B, c32_rnd); const __m128i T11A = _mm_add_epi32(E1A, c32_rnd); // E1 + rnd const __m128i T11B = _mm_add_epi32(E1B, c32_rnd); const __m128i T12A = _mm_add_epi32(E2A, c32_rnd); // E2 + rnd const __m128i T12B = _mm_add_epi32(E2B, c32_rnd); const __m128i T13A = _mm_add_epi32(E3A, c32_rnd); // E3 + rnd const __m128i T13B = _mm_add_epi32(E3B, c32_rnd); const __m128i T14A = _mm_add_epi32(E4A, c32_rnd); // E4 + rnd const __m128i T14B = _mm_add_epi32(E4B, c32_rnd); const __m128i T15A = _mm_add_epi32(E5A, c32_rnd); // E5 + rnd const __m128i T15B = _mm_add_epi32(E5B, c32_rnd); const __m128i T16A = _mm_add_epi32(E6A, c32_rnd); // E6 + rnd const __m128i T16B = _mm_add_epi32(E6B, c32_rnd); const __m128i T17A = _mm_add_epi32(E7A, c32_rnd); // E7 + rnd const __m128i T17B = _mm_add_epi32(E7B, c32_rnd); const __m128i T18A = _mm_add_epi32(E8A, c32_rnd); // E8 + rnd const __m128i T18B = _mm_add_epi32(E8B, c32_rnd); const __m128i T19A = _mm_add_epi32(E9A, c32_rnd); // E9 + rnd const __m128i T19B = _mm_add_epi32(E9B, c32_rnd); const __m128i T1AA = _mm_add_epi32(EAA, c32_rnd); // E10 + rnd const __m128i T1AB = _mm_add_epi32(EAB, c32_rnd); const __m128i T1BA = _mm_add_epi32(EBA, c32_rnd); // E11 + rnd const __m128i T1BB = _mm_add_epi32(EBB, c32_rnd); const __m128i T1CA = _mm_add_epi32(ECA, c32_rnd); // E12 + rnd const __m128i T1CB = _mm_add_epi32(ECB, c32_rnd); const __m128i T1DA = _mm_add_epi32(EDA, c32_rnd); // E13 + rnd const __m128i T1DB = _mm_add_epi32(EDB, c32_rnd); const __m128i T1EA = _mm_add_epi32(EEA, c32_rnd); // E14 + rnd const __m128i T1EB = _mm_add_epi32(EEB, c32_rnd); const __m128i T1FA = _mm_add_epi32(EFA, c32_rnd); // E15 + rnd const __m128i T1FB = _mm_add_epi32(EFB, c32_rnd); const __m128i T2_00A = _mm_add_epi32(T10A, O00A); // E0 + O0 + rnd const __m128i T2_00B = _mm_add_epi32(T10B, O00B); const __m128i T2_01A = _mm_add_epi32(T11A, O01A); // E1 + O1 + rnd const __m128i T2_01B = _mm_add_epi32(T11B, O01B); const __m128i T2_02A = _mm_add_epi32(T12A, O02A); // E2 + O2 + rnd const __m128i T2_02B = _mm_add_epi32(T12B, O02B); 
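/*
 * Pass 2 repeats the partial-butterfly structure of pass 1: O[0..15] is
 * accumulated from the odd input rows (only rows 1..15 carry non-zero data
 * in this half-decoding variant), and the even part is built bottom-up
 * through the EEEE/EEEO -> EEE -> EE -> E stages from the even rows.  The
 * T2_xx sums and differences below realize, per column,
 *     out[k] = E[k] + O[k]   and   out[31 - k] = E[k] - O[k],  k = 0..15,
 * ahead of the rounding shift and the 16-bit saturating pack.
 */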
const __m128i T2_03A = _mm_add_epi32(T13A, O03A); // E3 + O3 + rnd const __m128i T2_03B = _mm_add_epi32(T13B, O03B); const __m128i T2_04A = _mm_add_epi32(T14A, O04A); // E4 const __m128i T2_04B = _mm_add_epi32(T14B, O04B); const __m128i T2_05A = _mm_add_epi32(T15A, O05A); // E5 const __m128i T2_05B = _mm_add_epi32(T15B, O05B); const __m128i T2_06A = _mm_add_epi32(T16A, O06A); // E6 const __m128i T2_06B = _mm_add_epi32(T16B, O06B); const __m128i T2_07A = _mm_add_epi32(T17A, O07A); // E7 const __m128i T2_07B = _mm_add_epi32(T17B, O07B); const __m128i T2_08A = _mm_add_epi32(T18A, O08A); // E8 const __m128i T2_08B = _mm_add_epi32(T18B, O08B); const __m128i T2_09A = _mm_add_epi32(T19A, O09A); // E9 const __m128i T2_09B = _mm_add_epi32(T19B, O09B); const __m128i T2_10A = _mm_add_epi32(T1AA, O10A); // E10 const __m128i T2_10B = _mm_add_epi32(T1AB, O10B); const __m128i T2_11A = _mm_add_epi32(T1BA, O11A); // E11 const __m128i T2_11B = _mm_add_epi32(T1BB, O11B); const __m128i T2_12A = _mm_add_epi32(T1CA, O12A); // E12 const __m128i T2_12B = _mm_add_epi32(T1CB, O12B); const __m128i T2_13A = _mm_add_epi32(T1DA, O13A); // E13 const __m128i T2_13B = _mm_add_epi32(T1DB, O13B); const __m128i T2_14A = _mm_add_epi32(T1EA, O14A); // E14 const __m128i T2_14B = _mm_add_epi32(T1EB, O14B); const __m128i T2_15A = _mm_add_epi32(T1FA, O15A); // E15 const __m128i T2_15B = _mm_add_epi32(T1FB, O15B); const __m128i T2_31A = _mm_sub_epi32(T10A, O00A); // E0 - O0 + rnd const __m128i T2_31B = _mm_sub_epi32(T10B, O00B); const __m128i T2_30A = _mm_sub_epi32(T11A, O01A); // E1 - O1 + rnd const __m128i T2_30B = _mm_sub_epi32(T11B, O01B); const __m128i T2_29A = _mm_sub_epi32(T12A, O02A); // E2 - O2 + rnd const __m128i T2_29B = _mm_sub_epi32(T12B, O02B); const __m128i T2_28A = _mm_sub_epi32(T13A, O03A); // E3 - O3 + rnd const __m128i T2_28B = _mm_sub_epi32(T13B, O03B); const __m128i T2_27A = _mm_sub_epi32(T14A, O04A); // E4 const __m128i T2_27B = _mm_sub_epi32(T14B, O04B); const __m128i T2_26A = _mm_sub_epi32(T15A, O05A); // E5 const __m128i T2_26B = _mm_sub_epi32(T15B, O05B); const __m128i T2_25A = _mm_sub_epi32(T16A, O06A); // E6 const __m128i T2_25B = _mm_sub_epi32(T16B, O06B); const __m128i T2_24A = _mm_sub_epi32(T17A, O07A); // E7 const __m128i T2_24B = _mm_sub_epi32(T17B, O07B); const __m128i T2_23A = _mm_sub_epi32(T18A, O08A); // const __m128i T2_23B = _mm_sub_epi32(T18B, O08B); const __m128i T2_22A = _mm_sub_epi32(T19A, O09A); // const __m128i T2_22B = _mm_sub_epi32(T19B, O09B); const __m128i T2_21A = _mm_sub_epi32(T1AA, O10A); // const __m128i T2_21B = _mm_sub_epi32(T1AB, O10B); const __m128i T2_20A = _mm_sub_epi32(T1BA, O11A); // const __m128i T2_20B = _mm_sub_epi32(T1BB, O11B); const __m128i T2_19A = _mm_sub_epi32(T1CA, O12A); // const __m128i T2_19B = _mm_sub_epi32(T1CB, O12B); const __m128i T2_18A = _mm_sub_epi32(T1DA, O13A); // const __m128i T2_18B = _mm_sub_epi32(T1DB, O13B); const __m128i T2_17A = _mm_sub_epi32(T1EA, O14A); // const __m128i T2_17B = _mm_sub_epi32(T1EB, O14B); const __m128i T2_16A = _mm_sub_epi32(T1FA, O15A); // const __m128i T2_16B = _mm_sub_epi32(T1FB, O15B); const __m128i T3_00A = _mm_srai_epi32(T2_00A, nShift); // [30 20 10 00] const __m128i T3_00B = _mm_srai_epi32(T2_00B, nShift); // [70 60 50 40] const __m128i T3_01A = _mm_srai_epi32(T2_01A, nShift); // [31 21 11 01] const __m128i T3_01B = _mm_srai_epi32(T2_01B, nShift); // [71 61 51 41] const __m128i T3_02A = _mm_srai_epi32(T2_02A, nShift); // [32 22 12 02] const __m128i T3_02B = _mm_srai_epi32(T2_02B, nShift); // [72 62 52 42] const 
__m128i T3_03A = _mm_srai_epi32(T2_03A, nShift); // [33 23 13 03] const __m128i T3_03B = _mm_srai_epi32(T2_03B, nShift); // [73 63 53 43] const __m128i T3_04A = _mm_srai_epi32(T2_04A, nShift); // [33 24 14 04] const __m128i T3_04B = _mm_srai_epi32(T2_04B, nShift); // [74 64 54 44] const __m128i T3_05A = _mm_srai_epi32(T2_05A, nShift); // [35 25 15 05] const __m128i T3_05B = _mm_srai_epi32(T2_05B, nShift); // [75 65 55 45] const __m128i T3_06A = _mm_srai_epi32(T2_06A, nShift); // [36 26 16 06] const __m128i T3_06B = _mm_srai_epi32(T2_06B, nShift); // [76 66 56 46] const __m128i T3_07A = _mm_srai_epi32(T2_07A, nShift); // [37 27 17 07] const __m128i T3_07B = _mm_srai_epi32(T2_07B, nShift); // [77 67 57 47] const __m128i T3_08A = _mm_srai_epi32(T2_08A, nShift); // [30 20 10 00] x8 const __m128i T3_08B = _mm_srai_epi32(T2_08B, nShift); // [70 60 50 40] const __m128i T3_09A = _mm_srai_epi32(T2_09A, nShift); // [31 21 11 01] x9 const __m128i T3_09B = _mm_srai_epi32(T2_09B, nShift); // [71 61 51 41] const __m128i T3_10A = _mm_srai_epi32(T2_10A, nShift); // [32 22 12 02] xA const __m128i T3_10B = _mm_srai_epi32(T2_10B, nShift); // [72 62 52 42] const __m128i T3_11A = _mm_srai_epi32(T2_11A, nShift); // [33 23 13 03] xB const __m128i T3_11B = _mm_srai_epi32(T2_11B, nShift); // [73 63 53 43] const __m128i T3_12A = _mm_srai_epi32(T2_12A, nShift); // [33 24 14 04] xC const __m128i T3_12B = _mm_srai_epi32(T2_12B, nShift); // [74 64 54 44] const __m128i T3_13A = _mm_srai_epi32(T2_13A, nShift); // [35 25 15 05] xD const __m128i T3_13B = _mm_srai_epi32(T2_13B, nShift); // [75 65 55 45] const __m128i T3_14A = _mm_srai_epi32(T2_14A, nShift); // [36 26 16 06] xE const __m128i T3_14B = _mm_srai_epi32(T2_14B, nShift); // [76 66 56 46] const __m128i T3_15A = _mm_srai_epi32(T2_15A, nShift); // [37 27 17 07] xF const __m128i T3_15B = _mm_srai_epi32(T2_15B, nShift); // [77 67 57 47] const __m128i T3_16A = _mm_srai_epi32(T2_16A, nShift); // [30 20 10 00] const __m128i T3_16B = _mm_srai_epi32(T2_16B, nShift); // [70 60 50 40] const __m128i T3_17A = _mm_srai_epi32(T2_17A, nShift); // [31 21 11 01] const __m128i T3_17B = _mm_srai_epi32(T2_17B, nShift); // [71 61 51 41] const __m128i T3_18A = _mm_srai_epi32(T2_18A, nShift); // [32 22 12 02] const __m128i T3_18B = _mm_srai_epi32(T2_18B, nShift); // [72 62 52 42] const __m128i T3_19A = _mm_srai_epi32(T2_19A, nShift); // [33 23 13 03] const __m128i T3_19B = _mm_srai_epi32(T2_19B, nShift); // [73 63 53 43] const __m128i T3_20A = _mm_srai_epi32(T2_20A, nShift); // [33 24 14 04] const __m128i T3_20B = _mm_srai_epi32(T2_20B, nShift); // [74 64 54 44] const __m128i T3_21A = _mm_srai_epi32(T2_21A, nShift); // [35 25 15 05] const __m128i T3_21B = _mm_srai_epi32(T2_21B, nShift); // [75 65 55 45] const __m128i T3_22A = _mm_srai_epi32(T2_22A, nShift); // [36 26 16 06] const __m128i T3_22B = _mm_srai_epi32(T2_22B, nShift); // [76 66 56 46] const __m128i T3_23A = _mm_srai_epi32(T2_23A, nShift); // [37 27 17 07] const __m128i T3_23B = _mm_srai_epi32(T2_23B, nShift); // [77 67 57 47] const __m128i T3_24A = _mm_srai_epi32(T2_24A, nShift); // [30 20 10 00] x8 const __m128i T3_24B = _mm_srai_epi32(T2_24B, nShift); // [70 60 50 40] const __m128i T3_25A = _mm_srai_epi32(T2_25A, nShift); // [31 21 11 01] x9 const __m128i T3_25B = _mm_srai_epi32(T2_25B, nShift); // [71 61 51 41] const __m128i T3_26A = _mm_srai_epi32(T2_26A, nShift); // [32 22 12 02] xA const __m128i T3_26B = _mm_srai_epi32(T2_26B, nShift); // [72 62 52 42] const __m128i T3_27A = _mm_srai_epi32(T2_27A, nShift); // [33 23 13 03] 
xB const __m128i T3_27B = _mm_srai_epi32(T2_27B, nShift); // [73 63 53 43] const __m128i T3_28A = _mm_srai_epi32(T2_28A, nShift); // [33 24 14 04] xC const __m128i T3_28B = _mm_srai_epi32(T2_28B, nShift); // [74 64 54 44] const __m128i T3_29A = _mm_srai_epi32(T2_29A, nShift); // [35 25 15 05] xD const __m128i T3_29B = _mm_srai_epi32(T2_29B, nShift); // [75 65 55 45] const __m128i T3_30A = _mm_srai_epi32(T2_30A, nShift); // [36 26 16 06] xE const __m128i T3_30B = _mm_srai_epi32(T2_30B, nShift); // [76 66 56 46] const __m128i T3_31A = _mm_srai_epi32(T2_31A, nShift); // [37 27 17 07] xF const __m128i T3_31B = _mm_srai_epi32(T2_31B, nShift); // [77 67 57 47] res00[part] = _mm_packs_epi32(T3_00A, T3_00B); // [70 60 50 40 30 20 10 00] res01[part] = _mm_packs_epi32(T3_01A, T3_01B); // [71 61 51 41 31 21 11 01] res02[part] = _mm_packs_epi32(T3_02A, T3_02B); // [72 62 52 42 32 22 12 02] res03[part] = _mm_packs_epi32(T3_03A, T3_03B); // [73 63 53 43 33 23 13 03] res04[part] = _mm_packs_epi32(T3_04A, T3_04B); // [74 64 54 44 34 24 14 04] res05[part] = _mm_packs_epi32(T3_05A, T3_05B); // [75 65 55 45 35 25 15 05] res06[part] = _mm_packs_epi32(T3_06A, T3_06B); // [76 66 56 46 36 26 16 06] res07[part] = _mm_packs_epi32(T3_07A, T3_07B); // [77 67 57 47 37 27 17 07] res08[part] = _mm_packs_epi32(T3_08A, T3_08B); // [A0 ... 80] res09[part] = _mm_packs_epi32(T3_09A, T3_09B); // [A1 ... 81] res10[part] = _mm_packs_epi32(T3_10A, T3_10B); // [A2 ... 82] res11[part] = _mm_packs_epi32(T3_11A, T3_11B); // [A3 ... 83] res12[part] = _mm_packs_epi32(T3_12A, T3_12B); // [A4 ... 84] res13[part] = _mm_packs_epi32(T3_13A, T3_13B); // [A5 ... 85] res14[part] = _mm_packs_epi32(T3_14A, T3_14B); // [A6 ... 86] res15[part] = _mm_packs_epi32(T3_15A, T3_15B); // [A7 ... 87] res16[part] = _mm_packs_epi32(T3_16A, T3_16B); res17[part] = _mm_packs_epi32(T3_17A, T3_17B); res18[part] = _mm_packs_epi32(T3_18A, T3_18B); res19[part] = _mm_packs_epi32(T3_19A, T3_19B); res20[part] = _mm_packs_epi32(T3_20A, T3_20B); res21[part] = _mm_packs_epi32(T3_21A, T3_21B); res22[part] = _mm_packs_epi32(T3_22A, T3_22B); res23[part] = _mm_packs_epi32(T3_23A, T3_23B); res24[part] = _mm_packs_epi32(T3_24A, T3_24B); res25[part] = _mm_packs_epi32(T3_25A, T3_25B); res26[part] = _mm_packs_epi32(T3_26A, T3_26B); res27[part] = _mm_packs_epi32(T3_27A, T3_27B); res28[part] = _mm_packs_epi32(T3_28A, T3_28B); res29[part] = _mm_packs_epi32(T3_29A, T3_29B); res30[part] = _mm_packs_epi32(T3_30A, T3_30B); res31[part] = _mm_packs_epi32(T3_31A, T3_31B); } } //transpose matrix 8x8 16bit. 
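
    /*
     * Editorial note: the block below re-tiles the 32x32 result from the
     * res??[0..3] strips back into the in??[0..3] layout using an 8x8 16-bit
     * transpose built from three interleave stages: the epi16 unpacks pair
     * adjacent rows, the epi32 unpacks pair those pairs, and the epi64
     * unpacks emit the transposed rows -- 24 shuffle-class instructions per
     * 8x8 tile, with no loads or stores. Conceptually, O_j holds element j
     * of each of the eight inputs I_0..I_7.
     */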
{ __m128i tr0_0, tr0_1, tr0_2, tr0_3, tr0_4, tr0_5, tr0_6, tr0_7; __m128i tr1_0, tr1_1, tr1_2, tr1_3, tr1_4, tr1_5, tr1_6, tr1_7; #define TRANSPOSE_8x8_16BIT(I0, I1, I2, I3, I4, I5, I6, I7, O0, O1, O2, O3, O4, O5, O6, O7) \ tr0_0 = _mm_unpacklo_epi16(I0, I1); \ tr0_1 = _mm_unpacklo_epi16(I2, I3); \ tr0_2 = _mm_unpackhi_epi16(I0, I1); \ tr0_3 = _mm_unpackhi_epi16(I2, I3); \ tr0_4 = _mm_unpacklo_epi16(I4, I5); \ tr0_5 = _mm_unpacklo_epi16(I6, I7); \ tr0_6 = _mm_unpackhi_epi16(I4, I5); \ tr0_7 = _mm_unpackhi_epi16(I6, I7); \ tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \ tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \ tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \ tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \ O0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ O1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ O2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ O3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ O4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \ O5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \ O6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \ O7 = _mm_unpackhi_epi64(tr1_3, tr1_7); TRANSPOSE_8x8_16BIT(res00[0], res01[0], res02[0], res03[0], res04[0], res05[0], res06[0], res07[0], in00[0], in01[0], in02[0], in03[0], in04[0], in05[0], in06[0], in07[0]) TRANSPOSE_8x8_16BIT(res00[1], res01[1], res02[1], res03[1], res04[1], res05[1], res06[1], res07[1], in08[0], in09[0], in10[0], in11[0], in12[0], in13[0], in14[0], in15[0]) TRANSPOSE_8x8_16BIT(res00[2], res01[2], res02[2], res03[2], res04[2], res05[2], res06[2], res07[2], in16[0], in17[0], in18[0], in19[0], in20[0], in21[0], in22[0], in23[0]) TRANSPOSE_8x8_16BIT(res00[3], res01[3], res02[3], res03[3], res04[3], res05[3], res06[3], res07[3], in24[0], in25[0], in26[0], in27[0], in28[0], in29[0], in30[0], in31[0]) TRANSPOSE_8x8_16BIT(res08[0], res09[0], res10[0], res11[0], res12[0], res13[0], res14[0], res15[0], in00[1], in01[1], in02[1], in03[1], in04[1], in05[1], in06[1], in07[1]) TRANSPOSE_8x8_16BIT(res08[1], res09[1], res10[1], res11[1], res12[1], res13[1], res14[1], res15[1], in08[1], in09[1], in10[1], in11[1], in12[1], in13[1], in14[1], in15[1]) TRANSPOSE_8x8_16BIT(res08[2], res09[2], res10[2], res11[2], res12[2], res13[2], res14[2], res15[2], in16[1], in17[1], in18[1], in19[1], in20[1], in21[1], in22[1], in23[1]) TRANSPOSE_8x8_16BIT(res08[3], res09[3], res10[3], res11[3], res12[3], res13[3], res14[3], res15[3], in24[1], in25[1], in26[1], in27[1], in28[1], in29[1], in30[1], in31[1]) TRANSPOSE_8x8_16BIT(res16[0], res17[0], res18[0], res19[0], res20[0], res21[0], res22[0], res23[0], in00[2], in01[2], in02[2], in03[2], in04[2], in05[2], in06[2], in07[2]) TRANSPOSE_8x8_16BIT(res16[1], res17[1], res18[1], res19[1], res20[1], res21[1], res22[1], res23[1], in08[2], in09[2], in10[2], in11[2], in12[2], in13[2], in14[2], in15[2]) TRANSPOSE_8x8_16BIT(res16[2], res17[2], res18[2], res19[2], res20[2], res21[2], res22[2], res23[2], in16[2], in17[2], in18[2], in19[2], in20[2], in21[2], in22[2], in23[2]) TRANSPOSE_8x8_16BIT(res16[3], res17[3], res18[3], res19[3], res20[3], res21[3], res22[3], res23[3], in24[2], in25[2], in26[2], in27[2], in28[2], in29[2], in30[2], in31[2]) TRANSPOSE_8x8_16BIT(res24[0], res25[0], res26[0], res27[0], res28[0], res29[0], res30[0], res31[0], in00[3], in01[3], in02[3], in03[3], in04[3], in05[3], in06[3], in07[3]) TRANSPOSE_8x8_16BIT(res24[1], res25[1], res26[1], res27[1], res28[1], res29[1], res30[1], res31[1], in08[3], 
in09[3], in10[3], in11[3], in12[3], in13[3], in14[3], in15[3]) TRANSPOSE_8x8_16BIT(res24[2], res25[2], res26[2], res27[2], res28[2], res29[2], res30[2], res31[2], in16[3], in17[3], in18[3], in19[3], in20[3], in21[3], in22[3], in23[3]) TRANSPOSE_8x8_16BIT(res24[3], res25[3], res26[3], res27[3], res28[3], res29[3], res30[3], res31[3], in24[3], in25[3], in26[3], in27[3], in28[3], in29[3], in30[3], in31[3]) #undef TRANSPOSE_8x8_16BIT } //clip { __m128i max_val = _mm_set1_epi16((1 << (clip_depth2 - 1)) - 1); __m128i min_val = _mm_set1_epi16(-(1 << (clip_depth2 - 1))); int k; for (k = 0; k < 4; k++) { in00[k] = _mm_max_epi16(_mm_min_epi16(in00[k], max_val), min_val); in01[k] = _mm_max_epi16(_mm_min_epi16(in01[k], max_val), min_val); in02[k] = _mm_max_epi16(_mm_min_epi16(in02[k], max_val), min_val); in03[k] = _mm_max_epi16(_mm_min_epi16(in03[k], max_val), min_val); in04[k] = _mm_max_epi16(_mm_min_epi16(in04[k], max_val), min_val); in05[k] = _mm_max_epi16(_mm_min_epi16(in05[k], max_val), min_val); in06[k] = _mm_max_epi16(_mm_min_epi16(in06[k], max_val), min_val); in07[k] = _mm_max_epi16(_mm_min_epi16(in07[k], max_val), min_val); in08[k] = _mm_max_epi16(_mm_min_epi16(in08[k], max_val), min_val); in09[k] = _mm_max_epi16(_mm_min_epi16(in09[k], max_val), min_val); in10[k] = _mm_max_epi16(_mm_min_epi16(in10[k], max_val), min_val); in11[k] = _mm_max_epi16(_mm_min_epi16(in11[k], max_val), min_val); in12[k] = _mm_max_epi16(_mm_min_epi16(in12[k], max_val), min_val); in13[k] = _mm_max_epi16(_mm_min_epi16(in13[k], max_val), min_val); in14[k] = _mm_max_epi16(_mm_min_epi16(in14[k], max_val), min_val); in15[k] = _mm_max_epi16(_mm_min_epi16(in15[k], max_val), min_val); in16[k] = _mm_max_epi16(_mm_min_epi16(in16[k], max_val), min_val); in17[k] = _mm_max_epi16(_mm_min_epi16(in17[k], max_val), min_val); in18[k] = _mm_max_epi16(_mm_min_epi16(in18[k], max_val), min_val); in19[k] = _mm_max_epi16(_mm_min_epi16(in19[k], max_val), min_val); in20[k] = _mm_max_epi16(_mm_min_epi16(in20[k], max_val), min_val); in21[k] = _mm_max_epi16(_mm_min_epi16(in21[k], max_val), min_val); in22[k] = _mm_max_epi16(_mm_min_epi16(in22[k], max_val), min_val); in23[k] = _mm_max_epi16(_mm_min_epi16(in23[k], max_val), min_val); in24[k] = _mm_max_epi16(_mm_min_epi16(in24[k], max_val), min_val); in25[k] = _mm_max_epi16(_mm_min_epi16(in25[k], max_val), min_val); in26[k] = _mm_max_epi16(_mm_min_epi16(in26[k], max_val), min_val); in27[k] = _mm_max_epi16(_mm_min_epi16(in27[k], max_val), min_val); in28[k] = _mm_max_epi16(_mm_min_epi16(in28[k], max_val), min_val); in29[k] = _mm_max_epi16(_mm_min_epi16(in29[k], max_val), min_val); in30[k] = _mm_max_epi16(_mm_min_epi16(in30[k], max_val), min_val); in31[k] = _mm_max_epi16(_mm_min_epi16(in31[k], max_val), min_val); } } // Add for (i = 0; i < 2; i++) { #define STORE_LINE(L0, L1, L2, L3, L4, L5, L6, L7, H0, H1, H2, H3, H4, H5, H6, H7, offsetV, offsetH) \ _mm_storeu_si128((__m128i*)(dst + (0 + (offsetV)) * i_dst + (offsetH)+0), L0); \ _mm_storeu_si128((__m128i*)(dst + (0 + (offsetV)) * i_dst + (offsetH)+8), H0); \ _mm_storeu_si128((__m128i*)(dst + (1 + (offsetV)) * i_dst + (offsetH)+0), L1); \ _mm_storeu_si128((__m128i*)(dst + (1 + (offsetV)) * i_dst + (offsetH)+8), H1); \ _mm_storeu_si128((__m128i*)(dst + (2 + (offsetV)) * i_dst + (offsetH)+0), L2); \ _mm_storeu_si128((__m128i*)(dst + (2 + (offsetV)) * i_dst + (offsetH)+8), H2); \ _mm_storeu_si128((__m128i*)(dst + (3 + (offsetV)) * i_dst + (offsetH)+0), L3); \ _mm_storeu_si128((__m128i*)(dst + (3 + (offsetV)) * i_dst + (offsetH)+8), H3); \ 
        _mm_storeu_si128((__m128i*)(dst + (4 + (offsetV)) * i_dst + (offsetH)+0), L4); \
        _mm_storeu_si128((__m128i*)(dst + (4 + (offsetV)) * i_dst + (offsetH)+8), H4); \
        _mm_storeu_si128((__m128i*)(dst + (5 + (offsetV)) * i_dst + (offsetH)+0), L5); \
        _mm_storeu_si128((__m128i*)(dst + (5 + (offsetV)) * i_dst + (offsetH)+8), H5); \
        _mm_storeu_si128((__m128i*)(dst + (6 + (offsetV)) * i_dst + (offsetH)+0), L6); \
        _mm_storeu_si128((__m128i*)(dst + (6 + (offsetV)) * i_dst + (offsetH)+8), H6); \
        _mm_storeu_si128((__m128i*)(dst + (7 + (offsetV)) * i_dst + (offsetH)+0), L7); \
        _mm_storeu_si128((__m128i*)(dst + (7 + (offsetV)) * i_dst + (offsetH)+8), H7);

        const int k = i * 2;
        STORE_LINE(in00[k], in01[k], in02[k], in03[k], in04[k], in05[k], in06[k], in07[k],
                   in00[k + 1], in01[k + 1], in02[k + 1], in03[k + 1], in04[k + 1], in05[k + 1], in06[k + 1], in07[k + 1], 0, i * 16)
        STORE_LINE(in08[k], in09[k], in10[k], in11[k], in12[k], in13[k], in14[k], in15[k],
                   in08[k + 1], in09[k + 1], in10[k + 1], in11[k + 1], in12[k + 1], in13[k + 1], in14[k + 1], in15[k + 1], 8, i * 16)
        STORE_LINE(in16[k], in17[k], in18[k], in19[k], in20[k], in21[k], in22[k], in23[k],
                   in16[k + 1], in17[k + 1], in18[k + 1], in19[k + 1], in20[k + 1], in21[k + 1], in22[k + 1], in23[k + 1], 16, i * 16)
        STORE_LINE(in24[k], in25[k], in26[k], in27[k], in28[k], in29[k], in30[k], in31[k],
                   in24[k + 1], in25[k + 1], in26[k + 1], in27[k + 1], in28[k + 1], in29[k + 1], in30[k + 1], in31[k + 1], 24, i * 16)
#undef STORE_LINE
    }
}

/* --------------------------------------------------------------------------- */
void idct_32x32_quad_sse128(const coeff_t *src, coeff_t *dst, int i_dst)
{
    // TODO: implement this
    // only the top-left 1/4 of the block (the 8x8 low-frequency corner) holds non-zero coefficients
    int a_flag      = i_dst & 0x01;   /* the LSB of the stride carries a flag from the caller; it is cleared before use */
    int shift2      = 20 - g_bit_depth - a_flag;
    int clip_depth2 = g_bit_depth + 1 + a_flag;
    const __m128i c16_p45_p45 = _mm_set1_epi32(0x002D002D);
    const __m128i c16_p43_p44 = _mm_set1_epi32(0x002B002C);
    const __m128i c16_p41_p45 = _mm_set1_epi32(0x0029002D);
    const __m128i c16_p23_p34 = _mm_set1_epi32(0x00170022);
    const __m128i c16_p34_p44 = _mm_set1_epi32(0x0022002C);
    const __m128i c16_n07_p15 = _mm_set1_epi32(0xFFF9000F);
    const __m128i c16_p43_p45 = _mm_set1_epi32(0x002B002D);
    const __m128i c16_p23_p43 = _mm_set1_epi32(0x0017002B);
    const __m128i c16_n34_n07 = _mm_set1_epi32(0xFFDEFFF9);
    const __m128i c16_p11_p41 = _mm_set1_epi32(0x000B0029);
    const __m128i c16_n45_n27 = _mm_set1_epi32(0xFFD3FFE5);
    const __m128i c16_n02_p39 = _mm_set1_epi32(0xFFFE0027);
    const __m128i c16_n36_n41 = _mm_set1_epi32(0xFFDCFFD7);
    const __m128i c16_n15_p36 = _mm_set1_epi32(0xFFF10024);
    const __m128i c16_n11_n45 = _mm_set1_epi32(0xFFF5FFD3);
    const __m128i c16_n27_p34 = _mm_set1_epi32(0xFFE50022);
    const __m128i c16_p19_n39 = _mm_set1_epi32(0x0013FFD9);
    const __m128i c16_n36_p30 = _mm_set1_epi32(0xFFDC001E);
    const __m128i c16_p41_n23 = _mm_set1_epi32(0x0029FFE9);
    const __m128i c16_n43_p27 = _mm_set1_epi32(0xFFD5001B);
    const __m128i c16_p44_n02 = _mm_set1_epi32(0x002CFFFE);
    const __m128i c16_n45_p23 = _mm_set1_epi32(0xFFD30017);
    const __m128i c16_p27_p19 = _mm_set1_epi32(0x001B0013);
    const __m128i c16_n44_p19 = _mm_set1_epi32(0xFFD40013);
    const __m128i c16_n02_p36 = _mm_set1_epi32(0xFFFE0024);
    const __m128i c16_n39_p15 = _mm_set1_epi32(0xFFD9000F);
    const __m128i c16_n30_p45 = _mm_set1_epi32(0xFFE2002D);
    const __m128i c16_n30_p11 = _mm_set1_epi32(0xFFE2000B);
    const __m128i c16_n45_p43 = _mm_set1_epi32(0xFFD3002B);
    const __m128i c16_n19_p07 = _mm_set1_epi32(0xFFED0007);
    const __m128i c16_n39_p30 = _mm_set1_epi32(0xFFD9001E);
    const __m128i c16_n07_p02 =
_mm_set1_epi32(0xFFF90002); const __m128i c16_n15_p11 = _mm_set1_epi32(0xFFF1000B); const __m128i c16_p29_p43 = _mm_set1_epi32(0x001D002B); const __m128i c16_p04_p40 = _mm_set1_epi32(0x00040028); const __m128i c16_n21_p35 = _mm_set1_epi32(0xFFEB0023); const __m128i c16_n40_p29 = _mm_set1_epi32(0xFFD8001D); const __m128i c16_n45_p21 = _mm_set1_epi32(0xFFD30015); const __m128i c16_n35_p13 = _mm_set1_epi32(0xFFDD000D); const __m128i c16_n13_p04 = _mm_set1_epi32(0xFFF30004); const __m128i c16_p38_p44 = _mm_set1_epi32(0x0026002C); const __m128i c16_n09_p38 = _mm_set1_epi32(0xFFF70026); const __m128i c16_n44_p25 = _mm_set1_epi32(0xFFD40019); const __m128i c16_n25_p09 = _mm_set1_epi32(0xFFE70009); const __m128i c16_p32_p32 = _mm_set1_epi32(0x00200020); const __m128i c16_n32_p32 = _mm_set1_epi32(0xFFE00020); __m128i c32_rnd = _mm_set1_epi32(16); // add1 __m128i Zero_16 = _mm_set1_epi16(0); int nShift = 5; int i, part; // DCT1 __m128i in00[4], in01[4], in02[4], in03[4], in04[4], in05[4], in06[4], in07[4], in08[4], in09[4], in10[4], in11[4], in12[4], in13[4], in14[4], in15[4]; __m128i in16[4], in17[4], in18[4], in19[4], in20[4], in21[4], in22[4], in23[4], in24[4], in25[4], in26[4], in27[4], in28[4], in29[4], in30[4], in31[4]; __m128i res00[4], res01[4], res02[4], res03[4], res04[4], res05[4], res06[4], res07[4], res08[4], res09[4], res10[4], res11[4], res12[4], res13[4], res14[4], res15[4]; __m128i res16[4], res17[4], res18[4], res19[4], res20[4], res21[4], res22[4], res23[4], res24[4], res25[4], res26[4], res27[4], res28[4], res29[4], res30[4], res31[4]; i_dst &= 0xFE; /* remember to remove the flag bit */ in00[0] = _mm_loadu_si128((const __m128i*)&src[0 * 32]); in01[0] = _mm_loadu_si128((const __m128i*)&src[1 * 32]); in02[0] = _mm_loadu_si128((const __m128i*)&src[2 * 32]); in03[0] = _mm_loadu_si128((const __m128i*)&src[3 * 32]); in04[0] = _mm_loadu_si128((const __m128i*)&src[4 * 32]); in05[0] = _mm_loadu_si128((const __m128i*)&src[5 * 32]); in06[0] = _mm_loadu_si128((const __m128i*)&src[6 * 32]); in07[0] = _mm_loadu_si128((const __m128i*)&src[7 * 32]); //pass=1 const __m128i T_00_00A = _mm_unpacklo_epi16(in01[0], in03[0]); // [33 13 32 12 31 11 30 10] const __m128i T_00_00B = _mm_unpackhi_epi16(in01[0], in03[0]); // [37 17 36 16 35 15 34 14] const __m128i T_00_01A = _mm_unpacklo_epi16(in05[0], in07[0]); // [ ] const __m128i T_00_01B = _mm_unpackhi_epi16(in05[0], in07[0]); // [ ] const __m128i T_00_08A = _mm_unpacklo_epi16(in02[0], in06[0]); // [ ] const __m128i T_00_08B = _mm_unpackhi_epi16(in02[0], in06[0]); // [ ] const __m128i T_00_12A = _mm_unpacklo_epi16(in04[0], Zero_16); // [ ] const __m128i T_00_12B = _mm_unpackhi_epi16(in04[0], Zero_16); // [ ] const __m128i T_00_15A = _mm_unpacklo_epi16(in00[0], Zero_16); // const __m128i T_00_15B = _mm_unpackhi_epi16(in00[0], Zero_16); // [ ] __m128i O00A, O01A, O02A, O03A, O04A, O05A, O06A, O07A, O08A, O09A, O10A, O11A, O12A, O13A, O14A, O15A; __m128i O00B, O01B, O02B, O03B, O04B, O05B, O06B, O07B, O08B, O09B, O10B, O11B, O12B, O13B, O14B, O15B; __m128i EO0A, EO1A, EO2A, EO3A, EO4A, EO5A, EO6A, EO7A; __m128i EO0B, EO1B, EO2B, EO3B, EO4B, EO5B, EO6B, EO7B; O00A = _mm_add_epi32(_mm_madd_epi16(T_00_00A, c16_p45_p45), _mm_madd_epi16(T_00_01A, c16_p43_p44)); O01A = _mm_add_epi32(_mm_madd_epi16(T_00_00A, c16_p41_p45), _mm_madd_epi16(T_00_01A, c16_p23_p34)); O02A = _mm_add_epi32(_mm_madd_epi16(T_00_00A, c16_p34_p44), _mm_madd_epi16(T_00_01A, c16_n07_p15)); O03A = _mm_add_epi32(_mm_madd_epi16(T_00_00A, c16_p23_p43), _mm_madd_epi16(T_00_01A, c16_n34_n07)); O04A 
= _mm_add_epi32(_mm_madd_epi16(T_00_00A, c16_p11_p41), _mm_madd_epi16(T_00_01A, c16_n45_n27)); O05A = _mm_add_epi32(_mm_madd_epi16(T_00_00A, c16_n02_p39), _mm_madd_epi16(T_00_01A, c16_n36_n41)); O06A = _mm_add_epi32(_mm_madd_epi16(T_00_00A, c16_n15_p36), _mm_madd_epi16(T_00_01A, c16_n11_n45)); O07A = _mm_add_epi32(_mm_madd_epi16(T_00_00A, c16_n27_p34), _mm_madd_epi16(T_00_01A, c16_p19_n39)); O08A = _mm_add_epi32(_mm_madd_epi16(T_00_00A, c16_n36_p30), _mm_madd_epi16(T_00_01A, c16_p41_n23)); O09A = _mm_add_epi32(_mm_madd_epi16(T_00_00A, c16_n43_p27), _mm_madd_epi16(T_00_01A, c16_p44_n02)); O10A = _mm_add_epi32(_mm_madd_epi16(T_00_00A, c16_n45_p23), _mm_madd_epi16(T_00_01A, c16_p27_p19)); O11A = _mm_add_epi32(_mm_madd_epi16(T_00_00A, c16_n44_p19), _mm_madd_epi16(T_00_01A, c16_n02_p36)); O12A = _mm_add_epi32(_mm_madd_epi16(T_00_00A, c16_n39_p15), _mm_madd_epi16(T_00_01A, c16_n30_p45)); O13A = _mm_add_epi32(_mm_madd_epi16(T_00_00A, c16_n30_p11), _mm_madd_epi16(T_00_01A, c16_n45_p43)); O14A = _mm_add_epi32(_mm_madd_epi16(T_00_00A, c16_n19_p07), _mm_madd_epi16(T_00_01A, c16_n39_p30)); O15A = _mm_add_epi32(_mm_madd_epi16(T_00_00A, c16_n07_p02), _mm_madd_epi16(T_00_01A, c16_n15_p11)); O00B = _mm_add_epi32(_mm_madd_epi16(T_00_00B, c16_p45_p45), _mm_madd_epi16(T_00_01B, c16_p43_p44)); O01B = _mm_add_epi32(_mm_madd_epi16(T_00_00B, c16_p41_p45), _mm_madd_epi16(T_00_01B, c16_p23_p34)); O02B = _mm_add_epi32(_mm_madd_epi16(T_00_00B, c16_p34_p44), _mm_madd_epi16(T_00_01B, c16_n07_p15)); O03B = _mm_add_epi32(_mm_madd_epi16(T_00_00B, c16_p23_p43), _mm_madd_epi16(T_00_01B, c16_n34_n07)); O04B = _mm_add_epi32(_mm_madd_epi16(T_00_00B, c16_p11_p41), _mm_madd_epi16(T_00_01B, c16_n45_n27)); O05B = _mm_add_epi32(_mm_madd_epi16(T_00_00B, c16_n02_p39), _mm_madd_epi16(T_00_01B, c16_n36_n41)); O06B = _mm_add_epi32(_mm_madd_epi16(T_00_00B, c16_n15_p36), _mm_madd_epi16(T_00_01B, c16_n11_n45)); O07B = _mm_add_epi32(_mm_madd_epi16(T_00_00B, c16_n27_p34), _mm_madd_epi16(T_00_01B, c16_p19_n39)); O08B = _mm_add_epi32(_mm_madd_epi16(T_00_00B, c16_n36_p30), _mm_madd_epi16(T_00_01B, c16_p41_n23)); O09B = _mm_add_epi32(_mm_madd_epi16(T_00_00B, c16_n43_p27), _mm_madd_epi16(T_00_01B, c16_p44_n02)); O10B = _mm_add_epi32(_mm_madd_epi16(T_00_00B, c16_n45_p23), _mm_madd_epi16(T_00_01B, c16_p27_p19)); O11B = _mm_add_epi32(_mm_madd_epi16(T_00_00B, c16_n44_p19), _mm_madd_epi16(T_00_01B, c16_n02_p36)); O12B = _mm_add_epi32(_mm_madd_epi16(T_00_00B, c16_n39_p15), _mm_madd_epi16(T_00_01B, c16_n30_p45)); O13B = _mm_add_epi32(_mm_madd_epi16(T_00_00B, c16_n30_p11), _mm_madd_epi16(T_00_01B, c16_n45_p43)); O14B = _mm_add_epi32(_mm_madd_epi16(T_00_00B, c16_n19_p07), _mm_madd_epi16(T_00_01B, c16_n39_p30)); O15B = _mm_add_epi32(_mm_madd_epi16(T_00_00B, c16_n07_p02), _mm_madd_epi16(T_00_01B, c16_n15_p11)); EO0A = _mm_madd_epi16(T_00_08A, c16_p43_p45); EO1A = _mm_madd_epi16(T_00_08A, c16_p29_p43); EO2A = _mm_madd_epi16(T_00_08A, c16_p04_p40); EO3A = _mm_madd_epi16(T_00_08A, c16_n21_p35); EO4A = _mm_madd_epi16(T_00_08A, c16_n40_p29); EO5A = _mm_madd_epi16(T_00_08A, c16_n45_p21); EO6A = _mm_madd_epi16(T_00_08A, c16_n35_p13); EO7A = _mm_madd_epi16(T_00_08A, c16_n13_p04); EO0B = _mm_madd_epi16(T_00_08B, c16_p43_p45); EO1B = _mm_madd_epi16(T_00_08B, c16_p29_p43); EO2B = _mm_madd_epi16(T_00_08B, c16_p04_p40); EO3B = _mm_madd_epi16(T_00_08B, c16_n21_p35); EO4B = _mm_madd_epi16(T_00_08B, c16_n40_p29); EO5B = _mm_madd_epi16(T_00_08B, c16_n45_p21); EO6B = _mm_madd_epi16(T_00_08B, c16_n35_p13); EO7B = _mm_madd_epi16(T_00_08B, c16_n13_p04); { const __m128i EEO0A = 
_mm_madd_epi16(T_00_12A, c16_p38_p44); const __m128i EEO1A = _mm_madd_epi16(T_00_12A, c16_n09_p38); const __m128i EEO2A = _mm_madd_epi16(T_00_12A, c16_n44_p25); const __m128i EEO3A = _mm_madd_epi16(T_00_12A, c16_n25_p09); const __m128i EEO0B = _mm_madd_epi16(T_00_12B, c16_p38_p44); const __m128i EEO1B = _mm_madd_epi16(T_00_12B, c16_n09_p38); const __m128i EEO2B = _mm_madd_epi16(T_00_12B, c16_n44_p25); const __m128i EEO3B = _mm_madd_epi16(T_00_12B, c16_n25_p09); const __m128i EEEE0A = _mm_madd_epi16(T_00_15A, c16_p32_p32); const __m128i EEEE0B = _mm_madd_epi16(T_00_15B, c16_p32_p32); const __m128i EEEE1A = _mm_madd_epi16(T_00_15A, c16_n32_p32); const __m128i EEEE1B = _mm_madd_epi16(T_00_15B, c16_n32_p32); const __m128i EEE0A = EEEE0A; // EEE0 = EEEE0 + EEEO0 const __m128i EEE0B = EEEE0B; const __m128i EEE1A = EEEE1A; // EEE1 = EEEE1 + EEEO1 const __m128i EEE1B = EEEE1B; const __m128i EEE3A = EEEE0A; // EEE2 = EEEE0 - EEEO0 const __m128i EEE3B = EEEE0B; const __m128i EEE2A = EEEE1A; // EEE3 = EEEE1 - EEEO1 const __m128i EEE2B = EEEE1B; const __m128i EE0A = _mm_add_epi32(EEE0A, EEO0A); // EE0 = EEE0 + EEO0 const __m128i EE0B = _mm_add_epi32(EEE0B, EEO0B); const __m128i EE1A = _mm_add_epi32(EEE1A, EEO1A); // EE1 = EEE1 + EEO1 const __m128i EE1B = _mm_add_epi32(EEE1B, EEO1B); const __m128i EE2A = _mm_add_epi32(EEE2A, EEO2A); // EE2 = EEE0 + EEO0 const __m128i EE2B = _mm_add_epi32(EEE2B, EEO2B); const __m128i EE3A = _mm_add_epi32(EEE3A, EEO3A); // EE3 = EEE1 + EEO1 const __m128i EE3B = _mm_add_epi32(EEE3B, EEO3B); const __m128i EE7A = _mm_sub_epi32(EEE0A, EEO0A); // EE7 = EEE0 - EEO0 const __m128i EE7B = _mm_sub_epi32(EEE0B, EEO0B); const __m128i EE6A = _mm_sub_epi32(EEE1A, EEO1A); // EE6 = EEE1 - EEO1 const __m128i EE6B = _mm_sub_epi32(EEE1B, EEO1B); const __m128i EE5A = _mm_sub_epi32(EEE2A, EEO2A); // EE5 = EEE0 - EEO0 const __m128i EE5B = _mm_sub_epi32(EEE2B, EEO2B); const __m128i EE4A = _mm_sub_epi32(EEE3A, EEO3A); // EE4 = EEE1 - EEO1 const __m128i EE4B = _mm_sub_epi32(EEE3B, EEO3B); const __m128i E0A = _mm_add_epi32(EE0A, EO0A); // E0 = EE0 + EO0 const __m128i E0B = _mm_add_epi32(EE0B, EO0B); const __m128i E1A = _mm_add_epi32(EE1A, EO1A); // E1 = EE1 + EO1 const __m128i E1B = _mm_add_epi32(EE1B, EO1B); const __m128i E2A = _mm_add_epi32(EE2A, EO2A); // E2 = EE2 + EO2 const __m128i E2B = _mm_add_epi32(EE2B, EO2B); const __m128i E3A = _mm_add_epi32(EE3A, EO3A); // E3 = EE3 + EO3 const __m128i E3B = _mm_add_epi32(EE3B, EO3B); const __m128i E4A = _mm_add_epi32(EE4A, EO4A); // E4 = const __m128i E4B = _mm_add_epi32(EE4B, EO4B); const __m128i E5A = _mm_add_epi32(EE5A, EO5A); // E5 = const __m128i E5B = _mm_add_epi32(EE5B, EO5B); const __m128i E6A = _mm_add_epi32(EE6A, EO6A); // E6 = const __m128i E6B = _mm_add_epi32(EE6B, EO6B); const __m128i E7A = _mm_add_epi32(EE7A, EO7A); // E7 = const __m128i E7B = _mm_add_epi32(EE7B, EO7B); const __m128i EFA = _mm_sub_epi32(EE0A, EO0A); // EF = EE0 - EO0 const __m128i EFB = _mm_sub_epi32(EE0B, EO0B); const __m128i EEA = _mm_sub_epi32(EE1A, EO1A); // EE = EE1 - EO1 const __m128i EEB = _mm_sub_epi32(EE1B, EO1B); const __m128i EDA = _mm_sub_epi32(EE2A, EO2A); // ED = EE2 - EO2 const __m128i EDB = _mm_sub_epi32(EE2B, EO2B); const __m128i ECA = _mm_sub_epi32(EE3A, EO3A); // EC = EE3 - EO3 const __m128i ECB = _mm_sub_epi32(EE3B, EO3B); const __m128i EBA = _mm_sub_epi32(EE4A, EO4A); // EB = const __m128i EBB = _mm_sub_epi32(EE4B, EO4B); const __m128i EAA = _mm_sub_epi32(EE5A, EO5A); // EA = const __m128i EAB = _mm_sub_epi32(EE5B, EO5B); const __m128i E9A = 
_mm_sub_epi32(EE6A, EO6A); // E9 = const __m128i E9B = _mm_sub_epi32(EE6B, EO6B); const __m128i E8A = _mm_sub_epi32(EE7A, EO7A); // E8 = const __m128i E8B = _mm_sub_epi32(EE7B, EO7B); const __m128i T10A = _mm_add_epi32(E0A, c32_rnd); // E0 + rnd const __m128i T10B = _mm_add_epi32(E0B, c32_rnd); const __m128i T11A = _mm_add_epi32(E1A, c32_rnd); // E1 + rnd const __m128i T11B = _mm_add_epi32(E1B, c32_rnd); const __m128i T12A = _mm_add_epi32(E2A, c32_rnd); // E2 + rnd const __m128i T12B = _mm_add_epi32(E2B, c32_rnd); const __m128i T13A = _mm_add_epi32(E3A, c32_rnd); // E3 + rnd const __m128i T13B = _mm_add_epi32(E3B, c32_rnd); const __m128i T14A = _mm_add_epi32(E4A, c32_rnd); // E4 + rnd const __m128i T14B = _mm_add_epi32(E4B, c32_rnd); const __m128i T15A = _mm_add_epi32(E5A, c32_rnd); // E5 + rnd const __m128i T15B = _mm_add_epi32(E5B, c32_rnd); const __m128i T16A = _mm_add_epi32(E6A, c32_rnd); // E6 + rnd const __m128i T16B = _mm_add_epi32(E6B, c32_rnd); const __m128i T17A = _mm_add_epi32(E7A, c32_rnd); // E7 + rnd const __m128i T17B = _mm_add_epi32(E7B, c32_rnd); const __m128i T18A = _mm_add_epi32(E8A, c32_rnd); // E8 + rnd const __m128i T18B = _mm_add_epi32(E8B, c32_rnd); const __m128i T19A = _mm_add_epi32(E9A, c32_rnd); // E9 + rnd const __m128i T19B = _mm_add_epi32(E9B, c32_rnd); const __m128i T1AA = _mm_add_epi32(EAA, c32_rnd); // E10 + rnd const __m128i T1AB = _mm_add_epi32(EAB, c32_rnd); const __m128i T1BA = _mm_add_epi32(EBA, c32_rnd); // E11 + rnd const __m128i T1BB = _mm_add_epi32(EBB, c32_rnd); const __m128i T1CA = _mm_add_epi32(ECA, c32_rnd); // E12 + rnd const __m128i T1CB = _mm_add_epi32(ECB, c32_rnd); const __m128i T1DA = _mm_add_epi32(EDA, c32_rnd); // E13 + rnd const __m128i T1DB = _mm_add_epi32(EDB, c32_rnd); const __m128i T1EA = _mm_add_epi32(EEA, c32_rnd); // E14 + rnd const __m128i T1EB = _mm_add_epi32(EEB, c32_rnd); const __m128i T1FA = _mm_add_epi32(EFA, c32_rnd); // E15 + rnd const __m128i T1FB = _mm_add_epi32(EFB, c32_rnd); const __m128i T2_00A = _mm_add_epi32(T10A, O00A); // E0 + O0 + rnd const __m128i T2_00B = _mm_add_epi32(T10B, O00B); const __m128i T2_01A = _mm_add_epi32(T11A, O01A); // E1 + O1 + rnd const __m128i T2_01B = _mm_add_epi32(T11B, O01B); const __m128i T2_02A = _mm_add_epi32(T12A, O02A); // E2 + O2 + rnd const __m128i T2_02B = _mm_add_epi32(T12B, O02B); const __m128i T2_03A = _mm_add_epi32(T13A, O03A); // E3 + O3 + rnd const __m128i T2_03B = _mm_add_epi32(T13B, O03B); const __m128i T2_04A = _mm_add_epi32(T14A, O04A); // E4 const __m128i T2_04B = _mm_add_epi32(T14B, O04B); const __m128i T2_05A = _mm_add_epi32(T15A, O05A); // E5 const __m128i T2_05B = _mm_add_epi32(T15B, O05B); const __m128i T2_06A = _mm_add_epi32(T16A, O06A); // E6 const __m128i T2_06B = _mm_add_epi32(T16B, O06B); const __m128i T2_07A = _mm_add_epi32(T17A, O07A); // E7 const __m128i T2_07B = _mm_add_epi32(T17B, O07B); const __m128i T2_08A = _mm_add_epi32(T18A, O08A); // E8 const __m128i T2_08B = _mm_add_epi32(T18B, O08B); const __m128i T2_09A = _mm_add_epi32(T19A, O09A); // E9 const __m128i T2_09B = _mm_add_epi32(T19B, O09B); const __m128i T2_10A = _mm_add_epi32(T1AA, O10A); // E10 const __m128i T2_10B = _mm_add_epi32(T1AB, O10B); const __m128i T2_11A = _mm_add_epi32(T1BA, O11A); // E11 const __m128i T2_11B = _mm_add_epi32(T1BB, O11B); const __m128i T2_12A = _mm_add_epi32(T1CA, O12A); // E12 const __m128i T2_12B = _mm_add_epi32(T1CB, O12B); const __m128i T2_13A = _mm_add_epi32(T1DA, O13A); // E13 const __m128i T2_13B = _mm_add_epi32(T1DB, O13B); const __m128i T2_14A = 
_mm_add_epi32(T1EA, O14A); // E14 const __m128i T2_14B = _mm_add_epi32(T1EB, O14B); const __m128i T2_15A = _mm_add_epi32(T1FA, O15A); // E15 const __m128i T2_15B = _mm_add_epi32(T1FB, O15B); const __m128i T2_31A = _mm_sub_epi32(T10A, O00A); // E0 - O0 + rnd const __m128i T2_31B = _mm_sub_epi32(T10B, O00B); const __m128i T2_30A = _mm_sub_epi32(T11A, O01A); // E1 - O1 + rnd const __m128i T2_30B = _mm_sub_epi32(T11B, O01B); const __m128i T2_29A = _mm_sub_epi32(T12A, O02A); // E2 - O2 + rnd const __m128i T2_29B = _mm_sub_epi32(T12B, O02B); const __m128i T2_28A = _mm_sub_epi32(T13A, O03A); // E3 - O3 + rnd const __m128i T2_28B = _mm_sub_epi32(T13B, O03B); const __m128i T2_27A = _mm_sub_epi32(T14A, O04A); // E4 const __m128i T2_27B = _mm_sub_epi32(T14B, O04B); const __m128i T2_26A = _mm_sub_epi32(T15A, O05A); // E5 const __m128i T2_26B = _mm_sub_epi32(T15B, O05B); const __m128i T2_25A = _mm_sub_epi32(T16A, O06A); // E6 const __m128i T2_25B = _mm_sub_epi32(T16B, O06B); const __m128i T2_24A = _mm_sub_epi32(T17A, O07A); // E7 const __m128i T2_24B = _mm_sub_epi32(T17B, O07B); const __m128i T2_23A = _mm_sub_epi32(T18A, O08A); // const __m128i T2_23B = _mm_sub_epi32(T18B, O08B); const __m128i T2_22A = _mm_sub_epi32(T19A, O09A); // const __m128i T2_22B = _mm_sub_epi32(T19B, O09B); const __m128i T2_21A = _mm_sub_epi32(T1AA, O10A); // const __m128i T2_21B = _mm_sub_epi32(T1AB, O10B); const __m128i T2_20A = _mm_sub_epi32(T1BA, O11A); // const __m128i T2_20B = _mm_sub_epi32(T1BB, O11B); const __m128i T2_19A = _mm_sub_epi32(T1CA, O12A); // const __m128i T2_19B = _mm_sub_epi32(T1CB, O12B); const __m128i T2_18A = _mm_sub_epi32(T1DA, O13A); // const __m128i T2_18B = _mm_sub_epi32(T1DB, O13B); const __m128i T2_17A = _mm_sub_epi32(T1EA, O14A); // const __m128i T2_17B = _mm_sub_epi32(T1EB, O14B); const __m128i T2_16A = _mm_sub_epi32(T1FA, O15A); // const __m128i T2_16B = _mm_sub_epi32(T1FB, O15B); const __m128i T3_00A = _mm_srai_epi32(T2_00A, nShift); // [30 20 10 00] const __m128i T3_00B = _mm_srai_epi32(T2_00B, nShift); // [70 60 50 40] const __m128i T3_01A = _mm_srai_epi32(T2_01A, nShift); // [31 21 11 01] const __m128i T3_01B = _mm_srai_epi32(T2_01B, nShift); // [71 61 51 41] const __m128i T3_02A = _mm_srai_epi32(T2_02A, nShift); // [32 22 12 02] const __m128i T3_02B = _mm_srai_epi32(T2_02B, nShift); // [72 62 52 42] const __m128i T3_03A = _mm_srai_epi32(T2_03A, nShift); // [33 23 13 03] const __m128i T3_03B = _mm_srai_epi32(T2_03B, nShift); // [73 63 53 43] const __m128i T3_04A = _mm_srai_epi32(T2_04A, nShift); // [33 24 14 04] const __m128i T3_04B = _mm_srai_epi32(T2_04B, nShift); // [74 64 54 44] const __m128i T3_05A = _mm_srai_epi32(T2_05A, nShift); // [35 25 15 05] const __m128i T3_05B = _mm_srai_epi32(T2_05B, nShift); // [75 65 55 45] const __m128i T3_06A = _mm_srai_epi32(T2_06A, nShift); // [36 26 16 06] const __m128i T3_06B = _mm_srai_epi32(T2_06B, nShift); // [76 66 56 46] const __m128i T3_07A = _mm_srai_epi32(T2_07A, nShift); // [37 27 17 07] const __m128i T3_07B = _mm_srai_epi32(T2_07B, nShift); // [77 67 57 47] const __m128i T3_08A = _mm_srai_epi32(T2_08A, nShift); // [30 20 10 00] x8 const __m128i T3_08B = _mm_srai_epi32(T2_08B, nShift); // [70 60 50 40] const __m128i T3_09A = _mm_srai_epi32(T2_09A, nShift); // [31 21 11 01] x9 const __m128i T3_09B = _mm_srai_epi32(T2_09B, nShift); // [71 61 51 41] const __m128i T3_10A = _mm_srai_epi32(T2_10A, nShift); // [32 22 12 02] xA const __m128i T3_10B = _mm_srai_epi32(T2_10B, nShift); // [72 62 52 42] const __m128i T3_11A = _mm_srai_epi32(T2_11A, nShift); 
// [33 23 13 03] xB const __m128i T3_11B = _mm_srai_epi32(T2_11B, nShift); // [73 63 53 43] const __m128i T3_12A = _mm_srai_epi32(T2_12A, nShift); // [33 24 14 04] xC const __m128i T3_12B = _mm_srai_epi32(T2_12B, nShift); // [74 64 54 44] const __m128i T3_13A = _mm_srai_epi32(T2_13A, nShift); // [35 25 15 05] xD const __m128i T3_13B = _mm_srai_epi32(T2_13B, nShift); // [75 65 55 45] const __m128i T3_14A = _mm_srai_epi32(T2_14A, nShift); // [36 26 16 06] xE const __m128i T3_14B = _mm_srai_epi32(T2_14B, nShift); // [76 66 56 46] const __m128i T3_15A = _mm_srai_epi32(T2_15A, nShift); // [37 27 17 07] xF const __m128i T3_15B = _mm_srai_epi32(T2_15B, nShift); // [77 67 57 47] const __m128i T3_16A = _mm_srai_epi32(T2_16A, nShift); // [30 20 10 00] const __m128i T3_16B = _mm_srai_epi32(T2_16B, nShift); // [70 60 50 40] const __m128i T3_17A = _mm_srai_epi32(T2_17A, nShift); // [31 21 11 01] const __m128i T3_17B = _mm_srai_epi32(T2_17B, nShift); // [71 61 51 41] const __m128i T3_18A = _mm_srai_epi32(T2_18A, nShift); // [32 22 12 02] const __m128i T3_18B = _mm_srai_epi32(T2_18B, nShift); // [72 62 52 42] const __m128i T3_19A = _mm_srai_epi32(T2_19A, nShift); // [33 23 13 03] const __m128i T3_19B = _mm_srai_epi32(T2_19B, nShift); // [73 63 53 43] const __m128i T3_20A = _mm_srai_epi32(T2_20A, nShift); // [33 24 14 04] const __m128i T3_20B = _mm_srai_epi32(T2_20B, nShift); // [74 64 54 44] const __m128i T3_21A = _mm_srai_epi32(T2_21A, nShift); // [35 25 15 05] const __m128i T3_21B = _mm_srai_epi32(T2_21B, nShift); // [75 65 55 45] const __m128i T3_22A = _mm_srai_epi32(T2_22A, nShift); // [36 26 16 06] const __m128i T3_22B = _mm_srai_epi32(T2_22B, nShift); // [76 66 56 46] const __m128i T3_23A = _mm_srai_epi32(T2_23A, nShift); // [37 27 17 07] const __m128i T3_23B = _mm_srai_epi32(T2_23B, nShift); // [77 67 57 47] const __m128i T3_24A = _mm_srai_epi32(T2_24A, nShift); // [30 20 10 00] x8 const __m128i T3_24B = _mm_srai_epi32(T2_24B, nShift); // [70 60 50 40] const __m128i T3_25A = _mm_srai_epi32(T2_25A, nShift); // [31 21 11 01] x9 const __m128i T3_25B = _mm_srai_epi32(T2_25B, nShift); // [71 61 51 41] const __m128i T3_26A = _mm_srai_epi32(T2_26A, nShift); // [32 22 12 02] xA const __m128i T3_26B = _mm_srai_epi32(T2_26B, nShift); // [72 62 52 42] const __m128i T3_27A = _mm_srai_epi32(T2_27A, nShift); // [33 23 13 03] xB const __m128i T3_27B = _mm_srai_epi32(T2_27B, nShift); // [73 63 53 43] const __m128i T3_28A = _mm_srai_epi32(T2_28A, nShift); // [33 24 14 04] xC const __m128i T3_28B = _mm_srai_epi32(T2_28B, nShift); // [74 64 54 44] const __m128i T3_29A = _mm_srai_epi32(T2_29A, nShift); // [35 25 15 05] xD const __m128i T3_29B = _mm_srai_epi32(T2_29B, nShift); // [75 65 55 45] const __m128i T3_30A = _mm_srai_epi32(T2_30A, nShift); // [36 26 16 06] xE const __m128i T3_30B = _mm_srai_epi32(T2_30B, nShift); // [76 66 56 46] const __m128i T3_31A = _mm_srai_epi32(T2_31A, nShift); // [37 27 17 07] xF const __m128i T3_31B = _mm_srai_epi32(T2_31B, nShift); // [77 67 57 47] res00[0] = _mm_packs_epi32(T3_00A, T3_00B); // [70 60 50 40 30 20 10 00] res01[0] = _mm_packs_epi32(T3_01A, T3_01B); // [71 61 51 41 31 21 11 01] res02[0] = _mm_packs_epi32(T3_02A, T3_02B); // [72 62 52 42 32 22 12 02] res03[0] = _mm_packs_epi32(T3_03A, T3_03B); // [73 63 53 43 33 23 13 03] res04[0] = _mm_packs_epi32(T3_04A, T3_04B); // [74 64 54 44 34 24 14 04] res05[0] = _mm_packs_epi32(T3_05A, T3_05B); // [75 65 55 45 35 25 15 05] res06[0] = _mm_packs_epi32(T3_06A, T3_06B); // [76 66 56 46 36 26 16 06] res07[0] = _mm_packs_epi32(T3_07A, 
T3_07B); // [77 67 57 47 37 27 17 07] res08[0] = _mm_packs_epi32(T3_08A, T3_08B); // [A0 ... 80] res09[0] = _mm_packs_epi32(T3_09A, T3_09B); // [A1 ... 81] res10[0] = _mm_packs_epi32(T3_10A, T3_10B); // [A2 ... 82] res11[0] = _mm_packs_epi32(T3_11A, T3_11B); // [A3 ... 83] res12[0] = _mm_packs_epi32(T3_12A, T3_12B); // [A4 ... 84] res13[0] = _mm_packs_epi32(T3_13A, T3_13B); // [A5 ... 85] res14[0] = _mm_packs_epi32(T3_14A, T3_14B); // [A6 ... 86] res15[0] = _mm_packs_epi32(T3_15A, T3_15B); // [A7 ... 87] res16[0] = _mm_packs_epi32(T3_16A, T3_16B); res17[0] = _mm_packs_epi32(T3_17A, T3_17B); res18[0] = _mm_packs_epi32(T3_18A, T3_18B); res19[0] = _mm_packs_epi32(T3_19A, T3_19B); res20[0] = _mm_packs_epi32(T3_20A, T3_20B); res21[0] = _mm_packs_epi32(T3_21A, T3_21B); res22[0] = _mm_packs_epi32(T3_22A, T3_22B); res23[0] = _mm_packs_epi32(T3_23A, T3_23B); res24[0] = _mm_packs_epi32(T3_24A, T3_24B); res25[0] = _mm_packs_epi32(T3_25A, T3_25B); res26[0] = _mm_packs_epi32(T3_26A, T3_26B); res27[0] = _mm_packs_epi32(T3_27A, T3_27B); res28[0] = _mm_packs_epi32(T3_28A, T3_28B); res29[0] = _mm_packs_epi32(T3_29A, T3_29B); res30[0] = _mm_packs_epi32(T3_30A, T3_30B); res31[0] = _mm_packs_epi32(T3_31A, T3_31B); } //transpose matrix 8x8 16bit. { __m128i tr0_0, tr0_1, tr0_2, tr0_3, tr0_4, tr0_5, tr0_6, tr0_7; __m128i tr1_0, tr1_1, tr1_2, tr1_3, tr1_4, tr1_5, tr1_6, tr1_7; #define TRANSPOSE_8x8_16BIT(I0, I1, I2, I3, I4, I5, I6, I7, O0, O1, O2, O3, O4, O5, O6, O7) \ tr0_0 = _mm_unpacklo_epi16(I0, I1); \ tr0_1 = _mm_unpacklo_epi16(I2, I3); \ tr0_2 = _mm_unpackhi_epi16(I0, I1); \ tr0_3 = _mm_unpackhi_epi16(I2, I3); \ tr0_4 = _mm_unpacklo_epi16(I4, I5); \ tr0_5 = _mm_unpacklo_epi16(I6, I7); \ tr0_6 = _mm_unpackhi_epi16(I4, I5); \ tr0_7 = _mm_unpackhi_epi16(I6, I7); \ tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \ tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \ tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \ tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \ O0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ O1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ O2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ O3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ O4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \ O5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \ O6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \ O7 = _mm_unpackhi_epi64(tr1_3, tr1_7); TRANSPOSE_8x8_16BIT(res00[0], res01[0], res02[0], res03[0], res04[0], res05[0], res06[0], res07[0], in00[0], in01[0], in02[0], in03[0], in04[0], in05[0], in06[0], in07[0]) TRANSPOSE_8x8_16BIT(res08[0], res09[0], res10[0], res11[0], res12[0], res13[0], res14[0], res15[0], in00[1], in01[1], in02[1], in03[1], in04[1], in05[1], in06[1], in07[1]) TRANSPOSE_8x8_16BIT(res16[0], res17[0], res18[0], res19[0], res20[0], res21[0], res22[0], res23[0], in00[2], in01[2], in02[2], in03[2], in04[2], in05[2], in06[2], in07[2]) TRANSPOSE_8x8_16BIT(res24[0], res25[0], res26[0], res27[0], res28[0], res29[0], res30[0], res31[0], in00[3], in01[3], in02[3], in03[3], in04[3], in05[3], in06[3], in07[3]) #undef TRANSPOSE_8x8_16BIT } //pass=2 c32_rnd = _mm_set1_epi32(1 << (shift2 - 1)); // add2 nShift = shift2; for (part = 0; part < 4; part++) { const __m128i T_00_00A = _mm_unpacklo_epi16(in01[part], in03[part]); // [33 13 32 12 31 11 30 10] const __m128i T_00_00B = _mm_unpackhi_epi16(in01[part], in03[part]); // [37 17 36 16 35 15 34 14] const __m128i T_00_01A = _mm_unpacklo_epi16(in05[part], in07[part]); // [ ] 
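
        /*
         * Editorial note: in this quad path only coefficient rows 0..7 can be
         * non-zero, so the unpacks pair in00..in07 with each other (odd rows
         * in01/in03/in05/in07, even rows in02/in06, and in04/in00 against
         * Zero_16) instead of reading rows 8..31 at all. Each c16_pXX_pYY
         * constant packs two 16-bit transform coefficients into one 32-bit
         * lane, so a single _mm_madd_epi16 forms the dot product of one
         * coefficient pair. Scalar sketch of the first odd-part row
         * (illustrative only):
         *
         *     O00[i] = 45 * in01[i] + 45 * in03[i] + 44 * in05[i] + 43 * in07[i];
         */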
const __m128i T_00_01B = _mm_unpackhi_epi16(in05[part], in07[part]); // [ ] const __m128i T_00_08A = _mm_unpacklo_epi16(in02[part], in06[part]); // [ ] const __m128i T_00_08B = _mm_unpackhi_epi16(in02[part], in06[part]); // [ ] const __m128i T_00_12A = _mm_unpacklo_epi16(in04[part], Zero_16); // [ ] const __m128i T_00_12B = _mm_unpackhi_epi16(in04[part], Zero_16); // [ ] const __m128i T_00_15A = _mm_unpacklo_epi16(in00[part], Zero_16); // const __m128i T_00_15B = _mm_unpackhi_epi16(in00[part], Zero_16); // [ ] __m128i O00A, O01A, O02A, O03A, O04A, O05A, O06A, O07A, O08A, O09A, O10A, O11A, O12A, O13A, O14A, O15A; __m128i O00B, O01B, O02B, O03B, O04B, O05B, O06B, O07B, O08B, O09B, O10B, O11B, O12B, O13B, O14B, O15B; __m128i EO0A, EO1A, EO2A, EO3A, EO4A, EO5A, EO6A, EO7A; __m128i EO0B, EO1B, EO2B, EO3B, EO4B, EO5B, EO6B, EO7B; O00A = _mm_add_epi32(_mm_madd_epi16(T_00_00A, c16_p45_p45), _mm_madd_epi16(T_00_01A, c16_p43_p44)); O01A = _mm_add_epi32(_mm_madd_epi16(T_00_00A, c16_p41_p45), _mm_madd_epi16(T_00_01A, c16_p23_p34)); O02A = _mm_add_epi32(_mm_madd_epi16(T_00_00A, c16_p34_p44), _mm_madd_epi16(T_00_01A, c16_n07_p15)); O03A = _mm_add_epi32(_mm_madd_epi16(T_00_00A, c16_p23_p43), _mm_madd_epi16(T_00_01A, c16_n34_n07)); O04A = _mm_add_epi32(_mm_madd_epi16(T_00_00A, c16_p11_p41), _mm_madd_epi16(T_00_01A, c16_n45_n27)); O05A = _mm_add_epi32(_mm_madd_epi16(T_00_00A, c16_n02_p39), _mm_madd_epi16(T_00_01A, c16_n36_n41)); O06A = _mm_add_epi32(_mm_madd_epi16(T_00_00A, c16_n15_p36), _mm_madd_epi16(T_00_01A, c16_n11_n45)); O07A = _mm_add_epi32(_mm_madd_epi16(T_00_00A, c16_n27_p34), _mm_madd_epi16(T_00_01A, c16_p19_n39)); O08A = _mm_add_epi32(_mm_madd_epi16(T_00_00A, c16_n36_p30), _mm_madd_epi16(T_00_01A, c16_p41_n23)); O09A = _mm_add_epi32(_mm_madd_epi16(T_00_00A, c16_n43_p27), _mm_madd_epi16(T_00_01A, c16_p44_n02)); O10A = _mm_add_epi32(_mm_madd_epi16(T_00_00A, c16_n45_p23), _mm_madd_epi16(T_00_01A, c16_p27_p19)); O11A = _mm_add_epi32(_mm_madd_epi16(T_00_00A, c16_n44_p19), _mm_madd_epi16(T_00_01A, c16_n02_p36)); O12A = _mm_add_epi32(_mm_madd_epi16(T_00_00A, c16_n39_p15), _mm_madd_epi16(T_00_01A, c16_n30_p45)); O13A = _mm_add_epi32(_mm_madd_epi16(T_00_00A, c16_n30_p11), _mm_madd_epi16(T_00_01A, c16_n45_p43)); O14A = _mm_add_epi32(_mm_madd_epi16(T_00_00A, c16_n19_p07), _mm_madd_epi16(T_00_01A, c16_n39_p30)); O15A = _mm_add_epi32(_mm_madd_epi16(T_00_00A, c16_n07_p02), _mm_madd_epi16(T_00_01A, c16_n15_p11)); O00B = _mm_add_epi32(_mm_madd_epi16(T_00_00B, c16_p45_p45), _mm_madd_epi16(T_00_01B, c16_p43_p44)); O01B = _mm_add_epi32(_mm_madd_epi16(T_00_00B, c16_p41_p45), _mm_madd_epi16(T_00_01B, c16_p23_p34)); O02B = _mm_add_epi32(_mm_madd_epi16(T_00_00B, c16_p34_p44), _mm_madd_epi16(T_00_01B, c16_n07_p15)); O03B = _mm_add_epi32(_mm_madd_epi16(T_00_00B, c16_p23_p43), _mm_madd_epi16(T_00_01B, c16_n34_n07)); O04B = _mm_add_epi32(_mm_madd_epi16(T_00_00B, c16_p11_p41), _mm_madd_epi16(T_00_01B, c16_n45_n27)); O05B = _mm_add_epi32(_mm_madd_epi16(T_00_00B, c16_n02_p39), _mm_madd_epi16(T_00_01B, c16_n36_n41)); O06B = _mm_add_epi32(_mm_madd_epi16(T_00_00B, c16_n15_p36), _mm_madd_epi16(T_00_01B, c16_n11_n45)); O07B = _mm_add_epi32(_mm_madd_epi16(T_00_00B, c16_n27_p34), _mm_madd_epi16(T_00_01B, c16_p19_n39)); O08B = _mm_add_epi32(_mm_madd_epi16(T_00_00B, c16_n36_p30), _mm_madd_epi16(T_00_01B, c16_p41_n23)); O09B = _mm_add_epi32(_mm_madd_epi16(T_00_00B, c16_n43_p27), _mm_madd_epi16(T_00_01B, c16_p44_n02)); O10B = _mm_add_epi32(_mm_madd_epi16(T_00_00B, c16_n45_p23), _mm_madd_epi16(T_00_01B, c16_p27_p19)); O11B = 
_mm_add_epi32(_mm_madd_epi16(T_00_00B, c16_n44_p19), _mm_madd_epi16(T_00_01B, c16_n02_p36)); O12B = _mm_add_epi32(_mm_madd_epi16(T_00_00B, c16_n39_p15), _mm_madd_epi16(T_00_01B, c16_n30_p45)); O13B = _mm_add_epi32(_mm_madd_epi16(T_00_00B, c16_n30_p11), _mm_madd_epi16(T_00_01B, c16_n45_p43)); O14B = _mm_add_epi32(_mm_madd_epi16(T_00_00B, c16_n19_p07), _mm_madd_epi16(T_00_01B, c16_n39_p30)); O15B = _mm_add_epi32(_mm_madd_epi16(T_00_00B, c16_n07_p02), _mm_madd_epi16(T_00_01B, c16_n15_p11)); EO0A = _mm_madd_epi16(T_00_08A, c16_p43_p45); EO1A = _mm_madd_epi16(T_00_08A, c16_p29_p43); EO2A = _mm_madd_epi16(T_00_08A, c16_p04_p40); EO3A = _mm_madd_epi16(T_00_08A, c16_n21_p35); EO4A = _mm_madd_epi16(T_00_08A, c16_n40_p29); EO5A = _mm_madd_epi16(T_00_08A, c16_n45_p21); EO6A = _mm_madd_epi16(T_00_08A, c16_n35_p13); EO7A = _mm_madd_epi16(T_00_08A, c16_n13_p04); EO0B = _mm_madd_epi16(T_00_08B, c16_p43_p45); EO1B = _mm_madd_epi16(T_00_08B, c16_p29_p43); EO2B = _mm_madd_epi16(T_00_08B, c16_p04_p40); EO3B = _mm_madd_epi16(T_00_08B, c16_n21_p35); EO4B = _mm_madd_epi16(T_00_08B, c16_n40_p29); EO5B = _mm_madd_epi16(T_00_08B, c16_n45_p21); EO6B = _mm_madd_epi16(T_00_08B, c16_n35_p13); EO7B = _mm_madd_epi16(T_00_08B, c16_n13_p04); { const __m128i EEO0A = _mm_madd_epi16(T_00_12A, c16_p38_p44); const __m128i EEO1A = _mm_madd_epi16(T_00_12A, c16_n09_p38); const __m128i EEO2A = _mm_madd_epi16(T_00_12A, c16_n44_p25); const __m128i EEO3A = _mm_madd_epi16(T_00_12A, c16_n25_p09); const __m128i EEO0B = _mm_madd_epi16(T_00_12B, c16_p38_p44); const __m128i EEO1B = _mm_madd_epi16(T_00_12B, c16_n09_p38); const __m128i EEO2B = _mm_madd_epi16(T_00_12B, c16_n44_p25); const __m128i EEO3B = _mm_madd_epi16(T_00_12B, c16_n25_p09); const __m128i EEEE0A = _mm_madd_epi16(T_00_15A, c16_p32_p32); const __m128i EEEE0B = _mm_madd_epi16(T_00_15B, c16_p32_p32); const __m128i EEEE1A = _mm_madd_epi16(T_00_15A, c16_n32_p32); const __m128i EEEE1B = _mm_madd_epi16(T_00_15B, c16_n32_p32); const __m128i EEE0A = EEEE0A; // EEE0 = EEEE0 + EEEO0 const __m128i EEE0B = EEEE0B; const __m128i EEE1A = EEEE1A; // EEE1 = EEEE1 + EEEO1 const __m128i EEE1B = EEEE1B; const __m128i EEE3A = EEEE0A; // EEE2 = EEEE0 - EEEO0 const __m128i EEE3B = EEEE0B; const __m128i EEE2A = EEEE1A; // EEE3 = EEEE1 - EEEO1 const __m128i EEE2B = EEEE1B; const __m128i EE0A = _mm_add_epi32(EEE0A, EEO0A); // EE0 = EEE0 + EEO0 const __m128i EE0B = _mm_add_epi32(EEE0B, EEO0B); const __m128i EE1A = _mm_add_epi32(EEE1A, EEO1A); // EE1 = EEE1 + EEO1 const __m128i EE1B = _mm_add_epi32(EEE1B, EEO1B); const __m128i EE2A = _mm_add_epi32(EEE2A, EEO2A); // EE2 = EEE0 + EEO0 const __m128i EE2B = _mm_add_epi32(EEE2B, EEO2B); const __m128i EE3A = _mm_add_epi32(EEE3A, EEO3A); // EE3 = EEE1 + EEO1 const __m128i EE3B = _mm_add_epi32(EEE3B, EEO3B); const __m128i EE7A = _mm_sub_epi32(EEE0A, EEO0A); // EE7 = EEE0 - EEO0 const __m128i EE7B = _mm_sub_epi32(EEE0B, EEO0B); const __m128i EE6A = _mm_sub_epi32(EEE1A, EEO1A); // EE6 = EEE1 - EEO1 const __m128i EE6B = _mm_sub_epi32(EEE1B, EEO1B); const __m128i EE5A = _mm_sub_epi32(EEE2A, EEO2A); // EE5 = EEE0 - EEO0 const __m128i EE5B = _mm_sub_epi32(EEE2B, EEO2B); const __m128i EE4A = _mm_sub_epi32(EEE3A, EEO3A); // EE4 = EEE1 - EEO1 const __m128i EE4B = _mm_sub_epi32(EEE3B, EEO3B); const __m128i E0A = _mm_add_epi32(EE0A, EO0A); // E0 = EE0 + EO0 const __m128i E0B = _mm_add_epi32(EE0B, EO0B); const __m128i E1A = _mm_add_epi32(EE1A, EO1A); // E1 = EE1 + EO1 const __m128i E1B = _mm_add_epi32(EE1B, EO1B); const __m128i E2A = _mm_add_epi32(EE2A, EO2A); // E2 = EE2 
+ EO2 const __m128i E2B = _mm_add_epi32(EE2B, EO2B); const __m128i E3A = _mm_add_epi32(EE3A, EO3A); // E3 = EE3 + EO3 const __m128i E3B = _mm_add_epi32(EE3B, EO3B); const __m128i E4A = _mm_add_epi32(EE4A, EO4A); // E4 = const __m128i E4B = _mm_add_epi32(EE4B, EO4B); const __m128i E5A = _mm_add_epi32(EE5A, EO5A); // E5 = const __m128i E5B = _mm_add_epi32(EE5B, EO5B); const __m128i E6A = _mm_add_epi32(EE6A, EO6A); // E6 = const __m128i E6B = _mm_add_epi32(EE6B, EO6B); const __m128i E7A = _mm_add_epi32(EE7A, EO7A); // E7 = const __m128i E7B = _mm_add_epi32(EE7B, EO7B); const __m128i EFA = _mm_sub_epi32(EE0A, EO0A); // EF = EE0 - EO0 const __m128i EFB = _mm_sub_epi32(EE0B, EO0B); const __m128i EEA = _mm_sub_epi32(EE1A, EO1A); // EE = EE1 - EO1 const __m128i EEB = _mm_sub_epi32(EE1B, EO1B); const __m128i EDA = _mm_sub_epi32(EE2A, EO2A); // ED = EE2 - EO2 const __m128i EDB = _mm_sub_epi32(EE2B, EO2B); const __m128i ECA = _mm_sub_epi32(EE3A, EO3A); // EC = EE3 - EO3 const __m128i ECB = _mm_sub_epi32(EE3B, EO3B); const __m128i EBA = _mm_sub_epi32(EE4A, EO4A); // EB = const __m128i EBB = _mm_sub_epi32(EE4B, EO4B); const __m128i EAA = _mm_sub_epi32(EE5A, EO5A); // EA = const __m128i EAB = _mm_sub_epi32(EE5B, EO5B); const __m128i E9A = _mm_sub_epi32(EE6A, EO6A); // E9 = const __m128i E9B = _mm_sub_epi32(EE6B, EO6B); const __m128i E8A = _mm_sub_epi32(EE7A, EO7A); // E8 = const __m128i E8B = _mm_sub_epi32(EE7B, EO7B); const __m128i T10A = _mm_add_epi32(E0A, c32_rnd); // E0 + rnd const __m128i T10B = _mm_add_epi32(E0B, c32_rnd); const __m128i T11A = _mm_add_epi32(E1A, c32_rnd); // E1 + rnd const __m128i T11B = _mm_add_epi32(E1B, c32_rnd); const __m128i T12A = _mm_add_epi32(E2A, c32_rnd); // E2 + rnd const __m128i T12B = _mm_add_epi32(E2B, c32_rnd); const __m128i T13A = _mm_add_epi32(E3A, c32_rnd); // E3 + rnd const __m128i T13B = _mm_add_epi32(E3B, c32_rnd); const __m128i T14A = _mm_add_epi32(E4A, c32_rnd); // E4 + rnd const __m128i T14B = _mm_add_epi32(E4B, c32_rnd); const __m128i T15A = _mm_add_epi32(E5A, c32_rnd); // E5 + rnd const __m128i T15B = _mm_add_epi32(E5B, c32_rnd); const __m128i T16A = _mm_add_epi32(E6A, c32_rnd); // E6 + rnd const __m128i T16B = _mm_add_epi32(E6B, c32_rnd); const __m128i T17A = _mm_add_epi32(E7A, c32_rnd); // E7 + rnd const __m128i T17B = _mm_add_epi32(E7B, c32_rnd); const __m128i T18A = _mm_add_epi32(E8A, c32_rnd); // E8 + rnd const __m128i T18B = _mm_add_epi32(E8B, c32_rnd); const __m128i T19A = _mm_add_epi32(E9A, c32_rnd); // E9 + rnd const __m128i T19B = _mm_add_epi32(E9B, c32_rnd); const __m128i T1AA = _mm_add_epi32(EAA, c32_rnd); // E10 + rnd const __m128i T1AB = _mm_add_epi32(EAB, c32_rnd); const __m128i T1BA = _mm_add_epi32(EBA, c32_rnd); // E11 + rnd const __m128i T1BB = _mm_add_epi32(EBB, c32_rnd); const __m128i T1CA = _mm_add_epi32(ECA, c32_rnd); // E12 + rnd const __m128i T1CB = _mm_add_epi32(ECB, c32_rnd); const __m128i T1DA = _mm_add_epi32(EDA, c32_rnd); // E13 + rnd const __m128i T1DB = _mm_add_epi32(EDB, c32_rnd); const __m128i T1EA = _mm_add_epi32(EEA, c32_rnd); // E14 + rnd const __m128i T1EB = _mm_add_epi32(EEB, c32_rnd); const __m128i T1FA = _mm_add_epi32(EFA, c32_rnd); // E15 + rnd const __m128i T1FB = _mm_add_epi32(EFB, c32_rnd); const __m128i T2_00A = _mm_add_epi32(T10A, O00A); // E0 + O0 + rnd const __m128i T2_00B = _mm_add_epi32(T10B, O00B); const __m128i T2_01A = _mm_add_epi32(T11A, O01A); // E1 + O1 + rnd const __m128i T2_01B = _mm_add_epi32(T11B, O01B); const __m128i T2_02A = _mm_add_epi32(T12A, O02A); // E2 + O2 + rnd const __m128i T2_02B = 
_mm_add_epi32(T12B, O02B); const __m128i T2_03A = _mm_add_epi32(T13A, O03A); // E3 + O3 + rnd const __m128i T2_03B = _mm_add_epi32(T13B, O03B); const __m128i T2_04A = _mm_add_epi32(T14A, O04A); // E4 const __m128i T2_04B = _mm_add_epi32(T14B, O04B); const __m128i T2_05A = _mm_add_epi32(T15A, O05A); // E5 const __m128i T2_05B = _mm_add_epi32(T15B, O05B); const __m128i T2_06A = _mm_add_epi32(T16A, O06A); // E6 const __m128i T2_06B = _mm_add_epi32(T16B, O06B); const __m128i T2_07A = _mm_add_epi32(T17A, O07A); // E7 const __m128i T2_07B = _mm_add_epi32(T17B, O07B); const __m128i T2_08A = _mm_add_epi32(T18A, O08A); // E8 const __m128i T2_08B = _mm_add_epi32(T18B, O08B); const __m128i T2_09A = _mm_add_epi32(T19A, O09A); // E9 const __m128i T2_09B = _mm_add_epi32(T19B, O09B); const __m128i T2_10A = _mm_add_epi32(T1AA, O10A); // E10 const __m128i T2_10B = _mm_add_epi32(T1AB, O10B); const __m128i T2_11A = _mm_add_epi32(T1BA, O11A); // E11 const __m128i T2_11B = _mm_add_epi32(T1BB, O11B); const __m128i T2_12A = _mm_add_epi32(T1CA, O12A); // E12 const __m128i T2_12B = _mm_add_epi32(T1CB, O12B); const __m128i T2_13A = _mm_add_epi32(T1DA, O13A); // E13 const __m128i T2_13B = _mm_add_epi32(T1DB, O13B); const __m128i T2_14A = _mm_add_epi32(T1EA, O14A); // E14 const __m128i T2_14B = _mm_add_epi32(T1EB, O14B); const __m128i T2_15A = _mm_add_epi32(T1FA, O15A); // E15 const __m128i T2_15B = _mm_add_epi32(T1FB, O15B); const __m128i T2_31A = _mm_sub_epi32(T10A, O00A); // E0 - O0 + rnd const __m128i T2_31B = _mm_sub_epi32(T10B, O00B); const __m128i T2_30A = _mm_sub_epi32(T11A, O01A); // E1 - O1 + rnd const __m128i T2_30B = _mm_sub_epi32(T11B, O01B); const __m128i T2_29A = _mm_sub_epi32(T12A, O02A); // E2 - O2 + rnd const __m128i T2_29B = _mm_sub_epi32(T12B, O02B); const __m128i T2_28A = _mm_sub_epi32(T13A, O03A); // E3 - O3 + rnd const __m128i T2_28B = _mm_sub_epi32(T13B, O03B); const __m128i T2_27A = _mm_sub_epi32(T14A, O04A); // E4 const __m128i T2_27B = _mm_sub_epi32(T14B, O04B); const __m128i T2_26A = _mm_sub_epi32(T15A, O05A); // E5 const __m128i T2_26B = _mm_sub_epi32(T15B, O05B); const __m128i T2_25A = _mm_sub_epi32(T16A, O06A); // E6 const __m128i T2_25B = _mm_sub_epi32(T16B, O06B); const __m128i T2_24A = _mm_sub_epi32(T17A, O07A); // E7 const __m128i T2_24B = _mm_sub_epi32(T17B, O07B); const __m128i T2_23A = _mm_sub_epi32(T18A, O08A); // const __m128i T2_23B = _mm_sub_epi32(T18B, O08B); const __m128i T2_22A = _mm_sub_epi32(T19A, O09A); // const __m128i T2_22B = _mm_sub_epi32(T19B, O09B); const __m128i T2_21A = _mm_sub_epi32(T1AA, O10A); // const __m128i T2_21B = _mm_sub_epi32(T1AB, O10B); const __m128i T2_20A = _mm_sub_epi32(T1BA, O11A); // const __m128i T2_20B = _mm_sub_epi32(T1BB, O11B); const __m128i T2_19A = _mm_sub_epi32(T1CA, O12A); // const __m128i T2_19B = _mm_sub_epi32(T1CB, O12B); const __m128i T2_18A = _mm_sub_epi32(T1DA, O13A); // const __m128i T2_18B = _mm_sub_epi32(T1DB, O13B); const __m128i T2_17A = _mm_sub_epi32(T1EA, O14A); // const __m128i T2_17B = _mm_sub_epi32(T1EB, O14B); const __m128i T2_16A = _mm_sub_epi32(T1FA, O15A); // const __m128i T2_16B = _mm_sub_epi32(T1FB, O15B); const __m128i T3_00A = _mm_srai_epi32(T2_00A, nShift); // [30 20 10 00] const __m128i T3_00B = _mm_srai_epi32(T2_00B, nShift); // [70 60 50 40] const __m128i T3_01A = _mm_srai_epi32(T2_01A, nShift); // [31 21 11 01] const __m128i T3_01B = _mm_srai_epi32(T2_01B, nShift); // [71 61 51 41] const __m128i T3_02A = _mm_srai_epi32(T2_02A, nShift); // [32 22 12 02] const __m128i T3_02B = _mm_srai_epi32(T2_02B, nShift); 
// [72 62 52 42] const __m128i T3_03A = _mm_srai_epi32(T2_03A, nShift); // [33 23 13 03] const __m128i T3_03B = _mm_srai_epi32(T2_03B, nShift); // [73 63 53 43] const __m128i T3_04A = _mm_srai_epi32(T2_04A, nShift); // [33 24 14 04] const __m128i T3_04B = _mm_srai_epi32(T2_04B, nShift); // [74 64 54 44] const __m128i T3_05A = _mm_srai_epi32(T2_05A, nShift); // [35 25 15 05] const __m128i T3_05B = _mm_srai_epi32(T2_05B, nShift); // [75 65 55 45] const __m128i T3_06A = _mm_srai_epi32(T2_06A, nShift); // [36 26 16 06] const __m128i T3_06B = _mm_srai_epi32(T2_06B, nShift); // [76 66 56 46] const __m128i T3_07A = _mm_srai_epi32(T2_07A, nShift); // [37 27 17 07] const __m128i T3_07B = _mm_srai_epi32(T2_07B, nShift); // [77 67 57 47] const __m128i T3_08A = _mm_srai_epi32(T2_08A, nShift); // [30 20 10 00] x8 const __m128i T3_08B = _mm_srai_epi32(T2_08B, nShift); // [70 60 50 40] const __m128i T3_09A = _mm_srai_epi32(T2_09A, nShift); // [31 21 11 01] x9 const __m128i T3_09B = _mm_srai_epi32(T2_09B, nShift); // [71 61 51 41] const __m128i T3_10A = _mm_srai_epi32(T2_10A, nShift); // [32 22 12 02] xA const __m128i T3_10B = _mm_srai_epi32(T2_10B, nShift); // [72 62 52 42] const __m128i T3_11A = _mm_srai_epi32(T2_11A, nShift); // [33 23 13 03] xB const __m128i T3_11B = _mm_srai_epi32(T2_11B, nShift); // [73 63 53 43] const __m128i T3_12A = _mm_srai_epi32(T2_12A, nShift); // [33 24 14 04] xC const __m128i T3_12B = _mm_srai_epi32(T2_12B, nShift); // [74 64 54 44] const __m128i T3_13A = _mm_srai_epi32(T2_13A, nShift); // [35 25 15 05] xD const __m128i T3_13B = _mm_srai_epi32(T2_13B, nShift); // [75 65 55 45] const __m128i T3_14A = _mm_srai_epi32(T2_14A, nShift); // [36 26 16 06] xE const __m128i T3_14B = _mm_srai_epi32(T2_14B, nShift); // [76 66 56 46] const __m128i T3_15A = _mm_srai_epi32(T2_15A, nShift); // [37 27 17 07] xF const __m128i T3_15B = _mm_srai_epi32(T2_15B, nShift); // [77 67 57 47] const __m128i T3_16A = _mm_srai_epi32(T2_16A, nShift); // [30 20 10 00] const __m128i T3_16B = _mm_srai_epi32(T2_16B, nShift); // [70 60 50 40] const __m128i T3_17A = _mm_srai_epi32(T2_17A, nShift); // [31 21 11 01] const __m128i T3_17B = _mm_srai_epi32(T2_17B, nShift); // [71 61 51 41] const __m128i T3_18A = _mm_srai_epi32(T2_18A, nShift); // [32 22 12 02] const __m128i T3_18B = _mm_srai_epi32(T2_18B, nShift); // [72 62 52 42] const __m128i T3_19A = _mm_srai_epi32(T2_19A, nShift); // [33 23 13 03] const __m128i T3_19B = _mm_srai_epi32(T2_19B, nShift); // [73 63 53 43] const __m128i T3_20A = _mm_srai_epi32(T2_20A, nShift); // [33 24 14 04] const __m128i T3_20B = _mm_srai_epi32(T2_20B, nShift); // [74 64 54 44] const __m128i T3_21A = _mm_srai_epi32(T2_21A, nShift); // [35 25 15 05] const __m128i T3_21B = _mm_srai_epi32(T2_21B, nShift); // [75 65 55 45] const __m128i T3_22A = _mm_srai_epi32(T2_22A, nShift); // [36 26 16 06] const __m128i T3_22B = _mm_srai_epi32(T2_22B, nShift); // [76 66 56 46] const __m128i T3_23A = _mm_srai_epi32(T2_23A, nShift); // [37 27 17 07] const __m128i T3_23B = _mm_srai_epi32(T2_23B, nShift); // [77 67 57 47] const __m128i T3_24A = _mm_srai_epi32(T2_24A, nShift); // [30 20 10 00] x8 const __m128i T3_24B = _mm_srai_epi32(T2_24B, nShift); // [70 60 50 40] const __m128i T3_25A = _mm_srai_epi32(T2_25A, nShift); // [31 21 11 01] x9 const __m128i T3_25B = _mm_srai_epi32(T2_25B, nShift); // [71 61 51 41] const __m128i T3_26A = _mm_srai_epi32(T2_26A, nShift); // [32 22 12 02] xA const __m128i T3_26B = _mm_srai_epi32(T2_26B, nShift); // [72 62 52 42] const __m128i T3_27A = _mm_srai_epi32(T2_27A, 
nShift); // [33 23 13 03] xB const __m128i T3_27B = _mm_srai_epi32(T2_27B, nShift); // [73 63 53 43] const __m128i T3_28A = _mm_srai_epi32(T2_28A, nShift); // [33 24 14 04] xC const __m128i T3_28B = _mm_srai_epi32(T2_28B, nShift); // [74 64 54 44] const __m128i T3_29A = _mm_srai_epi32(T2_29A, nShift); // [35 25 15 05] xD const __m128i T3_29B = _mm_srai_epi32(T2_29B, nShift); // [75 65 55 45] const __m128i T3_30A = _mm_srai_epi32(T2_30A, nShift); // [36 26 16 06] xE const __m128i T3_30B = _mm_srai_epi32(T2_30B, nShift); // [76 66 56 46] const __m128i T3_31A = _mm_srai_epi32(T2_31A, nShift); // [37 27 17 07] xF const __m128i T3_31B = _mm_srai_epi32(T2_31B, nShift); // [77 67 57 47] res00[part] = _mm_packs_epi32(T3_00A, T3_00B); // [70 60 50 40 30 20 10 00] res01[part] = _mm_packs_epi32(T3_01A, T3_01B); // [71 61 51 41 31 21 11 01] res02[part] = _mm_packs_epi32(T3_02A, T3_02B); // [72 62 52 42 32 22 12 02] res03[part] = _mm_packs_epi32(T3_03A, T3_03B); // [73 63 53 43 33 23 13 03] res04[part] = _mm_packs_epi32(T3_04A, T3_04B); // [74 64 54 44 34 24 14 04] res05[part] = _mm_packs_epi32(T3_05A, T3_05B); // [75 65 55 45 35 25 15 05] res06[part] = _mm_packs_epi32(T3_06A, T3_06B); // [76 66 56 46 36 26 16 06] res07[part] = _mm_packs_epi32(T3_07A, T3_07B); // [77 67 57 47 37 27 17 07] res08[part] = _mm_packs_epi32(T3_08A, T3_08B); // [A0 ... 80] res09[part] = _mm_packs_epi32(T3_09A, T3_09B); // [A1 ... 81] res10[part] = _mm_packs_epi32(T3_10A, T3_10B); // [A2 ... 82] res11[part] = _mm_packs_epi32(T3_11A, T3_11B); // [A3 ... 83] res12[part] = _mm_packs_epi32(T3_12A, T3_12B); // [A4 ... 84] res13[part] = _mm_packs_epi32(T3_13A, T3_13B); // [A5 ... 85] res14[part] = _mm_packs_epi32(T3_14A, T3_14B); // [A6 ... 86] res15[part] = _mm_packs_epi32(T3_15A, T3_15B); // [A7 ... 87] res16[part] = _mm_packs_epi32(T3_16A, T3_16B); res17[part] = _mm_packs_epi32(T3_17A, T3_17B); res18[part] = _mm_packs_epi32(T3_18A, T3_18B); res19[part] = _mm_packs_epi32(T3_19A, T3_19B); res20[part] = _mm_packs_epi32(T3_20A, T3_20B); res21[part] = _mm_packs_epi32(T3_21A, T3_21B); res22[part] = _mm_packs_epi32(T3_22A, T3_22B); res23[part] = _mm_packs_epi32(T3_23A, T3_23B); res24[part] = _mm_packs_epi32(T3_24A, T3_24B); res25[part] = _mm_packs_epi32(T3_25A, T3_25B); res26[part] = _mm_packs_epi32(T3_26A, T3_26B); res27[part] = _mm_packs_epi32(T3_27A, T3_27B); res28[part] = _mm_packs_epi32(T3_28A, T3_28B); res29[part] = _mm_packs_epi32(T3_29A, T3_29B); res30[part] = _mm_packs_epi32(T3_30A, T3_30B); res31[part] = _mm_packs_epi32(T3_31A, T3_31B); } } //transpose matrix 8x8 16bit. 
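/*
 * The TRANSPOSE_8x8_16BIT macro below transposes an 8x8 tile of 16-bit
 * values held in eight XMM registers using three interleave stages
 * (unpack 16-bit pairs, then 32-bit pairs, then 64-bit halves).
 * A minimal scalar sketch of the same result, kept out of the build;
 * the helper name is illustrative only:
 */
#if 0
#include <stdint.h>
static void transpose_8x8_16bit_c(int16_t dst[8][8], const int16_t src[8][8])
{
    int r, c;
    for (r = 0; r < 8; r++) {
        for (c = 0; c < 8; c++) {
            dst[c][r] = src[r][c];  /* same effect as the unpack cascade */
        }
    }
}
#endif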
{ __m128i tr0_0, tr0_1, tr0_2, tr0_3, tr0_4, tr0_5, tr0_6, tr0_7; __m128i tr1_0, tr1_1, tr1_2, tr1_3, tr1_4, tr1_5, tr1_6, tr1_7; #define TRANSPOSE_8x8_16BIT(I0, I1, I2, I3, I4, I5, I6, I7, O0, O1, O2, O3, O4, O5, O6, O7) \ tr0_0 = _mm_unpacklo_epi16(I0, I1); \ tr0_1 = _mm_unpacklo_epi16(I2, I3); \ tr0_2 = _mm_unpackhi_epi16(I0, I1); \ tr0_3 = _mm_unpackhi_epi16(I2, I3); \ tr0_4 = _mm_unpacklo_epi16(I4, I5); \ tr0_5 = _mm_unpacklo_epi16(I6, I7); \ tr0_6 = _mm_unpackhi_epi16(I4, I5); \ tr0_7 = _mm_unpackhi_epi16(I6, I7); \ tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \ tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \ tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \ tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \ O0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ O1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ O2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ O3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ O4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \ O5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \ O6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \ O7 = _mm_unpackhi_epi64(tr1_3, tr1_7); TRANSPOSE_8x8_16BIT(res00[0], res01[0], res02[0], res03[0], res04[0], res05[0], res06[0], res07[0], in00[0], in01[0], in02[0], in03[0], in04[0], in05[0], in06[0], in07[0]) TRANSPOSE_8x8_16BIT(res00[1], res01[1], res02[1], res03[1], res04[1], res05[1], res06[1], res07[1], in08[0], in09[0], in10[0], in11[0], in12[0], in13[0], in14[0], in15[0]) TRANSPOSE_8x8_16BIT(res00[2], res01[2], res02[2], res03[2], res04[2], res05[2], res06[2], res07[2], in16[0], in17[0], in18[0], in19[0], in20[0], in21[0], in22[0], in23[0]) TRANSPOSE_8x8_16BIT(res00[3], res01[3], res02[3], res03[3], res04[3], res05[3], res06[3], res07[3], in24[0], in25[0], in26[0], in27[0], in28[0], in29[0], in30[0], in31[0]) TRANSPOSE_8x8_16BIT(res08[0], res09[0], res10[0], res11[0], res12[0], res13[0], res14[0], res15[0], in00[1], in01[1], in02[1], in03[1], in04[1], in05[1], in06[1], in07[1]) TRANSPOSE_8x8_16BIT(res08[1], res09[1], res10[1], res11[1], res12[1], res13[1], res14[1], res15[1], in08[1], in09[1], in10[1], in11[1], in12[1], in13[1], in14[1], in15[1]) TRANSPOSE_8x8_16BIT(res08[2], res09[2], res10[2], res11[2], res12[2], res13[2], res14[2], res15[2], in16[1], in17[1], in18[1], in19[1], in20[1], in21[1], in22[1], in23[1]) TRANSPOSE_8x8_16BIT(res08[3], res09[3], res10[3], res11[3], res12[3], res13[3], res14[3], res15[3], in24[1], in25[1], in26[1], in27[1], in28[1], in29[1], in30[1], in31[1]) TRANSPOSE_8x8_16BIT(res16[0], res17[0], res18[0], res19[0], res20[0], res21[0], res22[0], res23[0], in00[2], in01[2], in02[2], in03[2], in04[2], in05[2], in06[2], in07[2]) TRANSPOSE_8x8_16BIT(res16[1], res17[1], res18[1], res19[1], res20[1], res21[1], res22[1], res23[1], in08[2], in09[2], in10[2], in11[2], in12[2], in13[2], in14[2], in15[2]) TRANSPOSE_8x8_16BIT(res16[2], res17[2], res18[2], res19[2], res20[2], res21[2], res22[2], res23[2], in16[2], in17[2], in18[2], in19[2], in20[2], in21[2], in22[2], in23[2]) TRANSPOSE_8x8_16BIT(res16[3], res17[3], res18[3], res19[3], res20[3], res21[3], res22[3], res23[3], in24[2], in25[2], in26[2], in27[2], in28[2], in29[2], in30[2], in31[2]) TRANSPOSE_8x8_16BIT(res24[0], res25[0], res26[0], res27[0], res28[0], res29[0], res30[0], res31[0], in00[3], in01[3], in02[3], in03[3], in04[3], in05[3], in06[3], in07[3]) TRANSPOSE_8x8_16BIT(res24[1], res25[1], res26[1], res27[1], res28[1], res29[1], res30[1], res31[1], in08[3], 
in09[3], in10[3], in11[3], in12[3], in13[3], in14[3], in15[3]) TRANSPOSE_8x8_16BIT(res24[2], res25[2], res26[2], res27[2], res28[2], res29[2], res30[2], res31[2], in16[3], in17[3], in18[3], in19[3], in20[3], in21[3], in22[3], in23[3]) TRANSPOSE_8x8_16BIT(res24[3], res25[3], res26[3], res27[3], res28[3], res29[3], res30[3], res31[3], in24[3], in25[3], in26[3], in27[3], in28[3], in29[3], in30[3], in31[3]) #undef TRANSPOSE_8x8_16BIT } //clip { __m128i max_val = _mm_set1_epi16((1 << (clip_depth2 - 1)) - 1); __m128i min_val = _mm_set1_epi16(-(1 << (clip_depth2 - 1))); int k; for (k = 0; k < 4; k++) { in00[k] = _mm_max_epi16(_mm_min_epi16(in00[k], max_val), min_val); in01[k] = _mm_max_epi16(_mm_min_epi16(in01[k], max_val), min_val); in02[k] = _mm_max_epi16(_mm_min_epi16(in02[k], max_val), min_val); in03[k] = _mm_max_epi16(_mm_min_epi16(in03[k], max_val), min_val); in04[k] = _mm_max_epi16(_mm_min_epi16(in04[k], max_val), min_val); in05[k] = _mm_max_epi16(_mm_min_epi16(in05[k], max_val), min_val); in06[k] = _mm_max_epi16(_mm_min_epi16(in06[k], max_val), min_val); in07[k] = _mm_max_epi16(_mm_min_epi16(in07[k], max_val), min_val); in08[k] = _mm_max_epi16(_mm_min_epi16(in08[k], max_val), min_val); in09[k] = _mm_max_epi16(_mm_min_epi16(in09[k], max_val), min_val); in10[k] = _mm_max_epi16(_mm_min_epi16(in10[k], max_val), min_val); in11[k] = _mm_max_epi16(_mm_min_epi16(in11[k], max_val), min_val); in12[k] = _mm_max_epi16(_mm_min_epi16(in12[k], max_val), min_val); in13[k] = _mm_max_epi16(_mm_min_epi16(in13[k], max_val), min_val); in14[k] = _mm_max_epi16(_mm_min_epi16(in14[k], max_val), min_val); in15[k] = _mm_max_epi16(_mm_min_epi16(in15[k], max_val), min_val); in16[k] = _mm_max_epi16(_mm_min_epi16(in16[k], max_val), min_val); in17[k] = _mm_max_epi16(_mm_min_epi16(in17[k], max_val), min_val); in18[k] = _mm_max_epi16(_mm_min_epi16(in18[k], max_val), min_val); in19[k] = _mm_max_epi16(_mm_min_epi16(in19[k], max_val), min_val); in20[k] = _mm_max_epi16(_mm_min_epi16(in20[k], max_val), min_val); in21[k] = _mm_max_epi16(_mm_min_epi16(in21[k], max_val), min_val); in22[k] = _mm_max_epi16(_mm_min_epi16(in22[k], max_val), min_val); in23[k] = _mm_max_epi16(_mm_min_epi16(in23[k], max_val), min_val); in24[k] = _mm_max_epi16(_mm_min_epi16(in24[k], max_val), min_val); in25[k] = _mm_max_epi16(_mm_min_epi16(in25[k], max_val), min_val); in26[k] = _mm_max_epi16(_mm_min_epi16(in26[k], max_val), min_val); in27[k] = _mm_max_epi16(_mm_min_epi16(in27[k], max_val), min_val); in28[k] = _mm_max_epi16(_mm_min_epi16(in28[k], max_val), min_val); in29[k] = _mm_max_epi16(_mm_min_epi16(in29[k], max_val), min_val); in30[k] = _mm_max_epi16(_mm_min_epi16(in30[k], max_val), min_val); in31[k] = _mm_max_epi16(_mm_min_epi16(in31[k], max_val), min_val); } } // Add for (i = 0; i < 2; i++) { #define STORE_LINE(L0, L1, L2, L3, L4, L5, L6, L7, H0, H1, H2, H3, H4, H5, H6, H7, offsetV, offsetH) \ _mm_storeu_si128((__m128i*)(dst + (0 + (offsetV)) * i_dst + (offsetH)+0), L0); \ _mm_storeu_si128((__m128i*)(dst + (0 + (offsetV)) * i_dst + (offsetH)+8), H0); \ _mm_storeu_si128((__m128i*)(dst + (1 + (offsetV)) * i_dst + (offsetH)+0), L1); \ _mm_storeu_si128((__m128i*)(dst + (1 + (offsetV)) * i_dst + (offsetH)+8), H1); \ _mm_storeu_si128((__m128i*)(dst + (2 + (offsetV)) * i_dst + (offsetH)+0), L2); \ _mm_storeu_si128((__m128i*)(dst + (2 + (offsetV)) * i_dst + (offsetH)+8), H2); \ _mm_storeu_si128((__m128i*)(dst + (3 + (offsetV)) * i_dst + (offsetH)+0), L3); \ _mm_storeu_si128((__m128i*)(dst + (3 + (offsetV)) * i_dst + (offsetH)+8), H3); \ 
_mm_storeu_si128((__m128i*)(dst + (4 + (offsetV)) * i_dst + (offsetH)+0), L4); \ _mm_storeu_si128((__m128i*)(dst + (4 + (offsetV)) * i_dst + (offsetH)+8), H4); \ _mm_storeu_si128((__m128i*)(dst + (5 + (offsetV)) * i_dst + (offsetH)+0), L5); \ _mm_storeu_si128((__m128i*)(dst + (5 + (offsetV)) * i_dst + (offsetH)+8), H5); \ _mm_storeu_si128((__m128i*)(dst + (6 + (offsetV)) * i_dst + (offsetH)+0), L6); \ _mm_storeu_si128((__m128i*)(dst + (6 + (offsetV)) * i_dst + (offsetH)+8), H6); \ _mm_storeu_si128((__m128i*)(dst + (7 + (offsetV)) * i_dst + (offsetH)+0), L7); \ _mm_storeu_si128((__m128i*)(dst + (7 + (offsetV)) * i_dst + (offsetH)+8), H7); const int k = i * 2; STORE_LINE(in00[k], in01[k], in02[k], in03[k], in04[k], in05[k], in06[k], in07[k], in00[k + 1], in01[k + 1], in02[k + 1], in03[k + 1], in04[k + 1], in05[k + 1], in06[k + 1], in07[k + 1], 0, i * 16) STORE_LINE(in08[k], in09[k], in10[k], in11[k], in12[k], in13[k], in14[k], in15[k], in08[k + 1], in09[k + 1], in10[k + 1], in11[k + 1], in12[k + 1], in13[k + 1], in14[k + 1], in15[k + 1], 8, i * 16) STORE_LINE(in16[k], in17[k], in18[k], in19[k], in20[k], in21[k], in22[k], in23[k], in16[k + 1], in17[k + 1], in18[k + 1], in19[k + 1], in20[k + 1], in21[k + 1], in22[k + 1], in23[k + 1], 16, i * 16) STORE_LINE(in24[k], in25[k], in26[k], in27[k], in28[k], in29[k], in30[k], in31[k], in24[k + 1], in25[k + 1], in26[k + 1], in27[k + 1], in28[k + 1], in29[k + 1], in30[k + 1], in31[k + 1], 24, i * 16) #undef STORE_LINE } } /* --------------------------------------------------------------------------- */ void idct_32x8_sse128(const coeff_t *src, coeff_t *dst, int i_dst) { __m128i m128iS0[4], m128iS1[4], m128iS2[4], m128iS3[4], m128iS4[4], m128iS5[4], m128iS6[4], m128iS7[4]; __m128i m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2, m128Tmp3; __m128i E0h, E1h, E2h, E3h, E0l, E1l, E2l, E3l; __m128i O0h, O1h, O2h, O3h, O0l, O1l, O2l, O3l; __m128i EE0l, EE1l, E00l, E01l, EE0h, EE1h, E00h, E01h; //int shift1 = 5; int shift2 = 20 - g_bit_depth - (i_dst & 0x01); //int clip_depth1 = LIMIT_BIT; int clip_depth2 = g_bit_depth + 1 + (i_dst & 0x01); int i, pass; i_dst &= 0xFE; /* remember to remove the flag bit */ m128iAdd = _mm_set1_epi32(16); // add1 for (pass = 0; pass < 4; pass++) { m128iS1[pass] = _mm_load_si128((__m128i*)&src[pass * 8 + 1 * 32]); m128iS3[pass] = _mm_load_si128((__m128i*)&src[pass * 8 + 3 * 32]); m128Tmp0 = _mm_unpacklo_epi16(m128iS1[pass], m128iS3[pass]); E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[0]))); m128Tmp1 = _mm_unpackhi_epi16(m128iS1[pass], m128iS3[pass]); E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[0]))); m128iS5[pass] = _mm_load_si128((__m128i*)&src[pass * 8 + 5 * 32]); m128iS7[pass] = _mm_load_si128((__m128i*)&src[pass * 8 + 7 * 32]); m128Tmp2 = _mm_unpacklo_epi16(m128iS5[pass], m128iS7[pass]); E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[1]))); m128Tmp3 = _mm_unpackhi_epi16(m128iS5[pass], m128iS7[pass]); E2h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[1]))); O0l = _mm_add_epi32(E1l, E2l); O0h = _mm_add_epi32(E1h, E2h); E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[2]))); E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[2]))); E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[3]))); E2h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[3]))); O1l = _mm_add_epi32(E1l, E2l); O1h = _mm_add_epi32(E1h, E2h); E1l = _mm_madd_epi16(m128Tmp0, 
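/*
 * idct_32x8_sse128 runs in two passes: an 8-point IDCT over the rows with
 * a fixed shift of 5 (add1 = 16), then a 32-point IDCT over the columns
 * with shift2 = 20 - g_bit_depth - (i_dst & 0x01); the low bit of i_dst
 * carries a flag and is masked off before i_dst is used as a stride.
 * A minimal scalar sketch of the first-pass butterfly, assuming E[k] and
 * O[k] were already accumulated from the even and odd coefficient rows
 * (the SIMD version additionally saturates via _mm_packs_epi32); the
 * helper name is illustrative only:
 */
#if 0
static void butterfly8_first_pass_c(short *row8, const int E[4], const int O[4])
{
    int k;
    for (k = 0; k < 4; k++) {
        row8[k]     = (short)((E[k] + O[k] + 16) >> 5);  /* add1 = 16, shift1 = 5 */
        row8[7 - k] = (short)((E[k] - O[k] + 16) >> 5);
    }
}
#endif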
_mm_load_si128((__m128i*)(tab_idct_8x8[4])));
        E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[4])));
        E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[5])));
        E2h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[5])));
        O2l = _mm_add_epi32(E1l, E2l);
        O2h = _mm_add_epi32(E1h, E2h);
        E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[6])));
        E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[6])));
        E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[7])));
        E2h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[7])));
        O3h = _mm_add_epi32(E1h, E2h);
        O3l = _mm_add_epi32(E1l, E2l);
        /* ------- */
        m128iS0[pass] = _mm_load_si128((__m128i*)&src[pass * 8 + 0 * 32]);
        m128iS4[pass] = _mm_load_si128((__m128i*)&src[pass * 8 + 4 * 32]);
        m128Tmp0 = _mm_unpacklo_epi16(m128iS0[pass], m128iS4[pass]);
        EE0l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[8])));
        m128Tmp1 = _mm_unpackhi_epi16(m128iS0[pass], m128iS4[pass]);
        EE0h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[8])));
        EE1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[9])));
        EE1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[9])));
        /* ------- */
        m128iS2[pass] = _mm_load_si128((__m128i*)&src[pass * 8 + 2 * 32]);
        m128iS6[pass] = _mm_load_si128((__m128i*)&src[pass * 8 + 6 * 32]);
        m128Tmp0 = _mm_unpacklo_epi16(m128iS2[pass], m128iS6[pass]);
        E00l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[10])));
        m128Tmp1 = _mm_unpackhi_epi16(m128iS2[pass], m128iS6[pass]);
        E00h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[10])));
        E01l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[11])));
        E01h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[11])));
        E0l = _mm_add_epi32(EE0l, E00l);
        E0l = _mm_add_epi32(E0l, m128iAdd);
        E0h = _mm_add_epi32(EE0h, E00h);
        E0h = _mm_add_epi32(E0h, m128iAdd);
        E3l = _mm_sub_epi32(EE0l, E00l);
        E3l = _mm_add_epi32(E3l, m128iAdd);
        E3h = _mm_sub_epi32(EE0h, E00h);
        E3h = _mm_add_epi32(E3h, m128iAdd);
        E1l = _mm_add_epi32(EE1l, E01l);
        E1l = _mm_add_epi32(E1l, m128iAdd);
        E1h = _mm_add_epi32(EE1h, E01h);
        E1h = _mm_add_epi32(E1h, m128iAdd);
        E2l = _mm_sub_epi32(EE1l, E01l);
        E2l = _mm_add_epi32(E2l, m128iAdd);
        E2h = _mm_sub_epi32(EE1h, E01h);
        E2h = _mm_add_epi32(E2h, m128iAdd);
        m128iS0[pass] = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E0l, O0l), 5), _mm_srai_epi32(_mm_add_epi32(E0h, O0h), 5)); // first-pass transform shift: (x + 16) >> 5
        m128iS7[pass] = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E0l, O0l), 5), _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), 5));
        m128iS1[pass] = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E1l, O1l), 5), _mm_srai_epi32(_mm_add_epi32(E1h, O1h), 5));
        m128iS6[pass] = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E1l, O1l), 5), _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), 5));
        m128iS2[pass] = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E2l, O2l), 5), _mm_srai_epi32(_mm_add_epi32(E2h, O2h), 5));
        m128iS5[pass] = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E2l, O2l), 5), _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), 5));
        m128iS3[pass] = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E3l, O3l), 5), _mm_srai_epi32(_mm_add_epi32(E3h, O3h), 5));
        m128iS4[pass] = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E3l, O3l), 5), _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), 5));
        /* transpose matrix */
        E0l = _mm_unpacklo_epi16(m128iS0[pass], m128iS4[pass]);
        E1l = _mm_unpacklo_epi16(m128iS1[pass], m128iS5[pass]);
        E2l =
_mm_unpacklo_epi16(m128iS2[pass], m128iS6[pass]); E3l = _mm_unpacklo_epi16(m128iS3[pass], m128iS7[pass]); O0l = _mm_unpackhi_epi16(m128iS0[pass], m128iS4[pass]); O1l = _mm_unpackhi_epi16(m128iS1[pass], m128iS5[pass]); O2l = _mm_unpackhi_epi16(m128iS2[pass], m128iS6[pass]); O3l = _mm_unpackhi_epi16(m128iS3[pass], m128iS7[pass]); m128Tmp0 = _mm_unpacklo_epi16(E0l, E2l); m128Tmp1 = _mm_unpacklo_epi16(E1l, E3l); m128iS0[pass] = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1); m128iS1[pass] = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1); m128Tmp2 = _mm_unpackhi_epi16(E0l, E2l); m128Tmp3 = _mm_unpackhi_epi16(E1l, E3l); m128iS2[pass] = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3); m128iS3[pass] = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3); m128Tmp0 = _mm_unpacklo_epi16(O0l, O2l); m128Tmp1 = _mm_unpacklo_epi16(O1l, O3l); m128iS4[pass] = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1); m128iS5[pass] = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1); m128Tmp2 = _mm_unpackhi_epi16(O0l, O2l); m128Tmp3 = _mm_unpackhi_epi16(O1l, O3l); m128iS6[pass] = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3); m128iS7[pass] = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3); } { const __m128i c16_p45_p45 = _mm_set1_epi32(0x002D002D); const __m128i c16_p43_p44 = _mm_set1_epi32(0x002B002C); const __m128i c16_p39_p41 = _mm_set1_epi32(0x00270029); const __m128i c16_p34_p36 = _mm_set1_epi32(0x00220024); const __m128i c16_p27_p30 = _mm_set1_epi32(0x001B001E); const __m128i c16_p19_p23 = _mm_set1_epi32(0x00130017); const __m128i c16_p11_p15 = _mm_set1_epi32(0x000B000F); const __m128i c16_p02_p07 = _mm_set1_epi32(0x00020007); const __m128i c16_p41_p45 = _mm_set1_epi32(0x0029002D); const __m128i c16_p23_p34 = _mm_set1_epi32(0x00170022); const __m128i c16_n02_p11 = _mm_set1_epi32(0xFFFE000B); const __m128i c16_n27_n15 = _mm_set1_epi32(0xFFE5FFF1); const __m128i c16_n43_n36 = _mm_set1_epi32(0xFFD5FFDC); const __m128i c16_n44_n45 = _mm_set1_epi32(0xFFD4FFD3); const __m128i c16_n30_n39 = _mm_set1_epi32(0xFFE2FFD9); const __m128i c16_n07_n19 = _mm_set1_epi32(0xFFF9FFED); const __m128i c16_p34_p44 = _mm_set1_epi32(0x0022002C); const __m128i c16_n07_p15 = _mm_set1_epi32(0xFFF9000F); const __m128i c16_n41_n27 = _mm_set1_epi32(0xFFD7FFE5); const __m128i c16_n39_n45 = _mm_set1_epi32(0xFFD9FFD3); const __m128i c16_n02_n23 = _mm_set1_epi32(0xFFFEFFE9); const __m128i c16_p36_p19 = _mm_set1_epi32(0x00240013); const __m128i c16_p43_p45 = _mm_set1_epi32(0x002B002D); const __m128i c16_p11_p30 = _mm_set1_epi32(0x000B001E); const __m128i c16_p23_p43 = _mm_set1_epi32(0x0017002B); const __m128i c16_n34_n07 = _mm_set1_epi32(0xFFDEFFF9); const __m128i c16_n36_n45 = _mm_set1_epi32(0xFFDCFFD3); const __m128i c16_p19_n11 = _mm_set1_epi32(0x0013FFF5); const __m128i c16_p44_p41 = _mm_set1_epi32(0x002C0029); const __m128i c16_n02_p27 = _mm_set1_epi32(0xFFFE001B); const __m128i c16_n45_n30 = _mm_set1_epi32(0xFFD3FFE2); const __m128i c16_n15_n39 = _mm_set1_epi32(0xFFF1FFD9); const __m128i c16_p11_p41 = _mm_set1_epi32(0x000B0029); const __m128i c16_n45_n27 = _mm_set1_epi32(0xFFD3FFE5); const __m128i c16_p07_n30 = _mm_set1_epi32(0x0007FFE2); const __m128i c16_p43_p39 = _mm_set1_epi32(0x002B0027); const __m128i c16_n23_p15 = _mm_set1_epi32(0xFFE9000F); const __m128i c16_n34_n45 = _mm_set1_epi32(0xFFDEFFD3); const __m128i c16_p36_p02 = _mm_set1_epi32(0x00240002); const __m128i c16_p19_p44 = _mm_set1_epi32(0x0013002C); const __m128i c16_n02_p39 = _mm_set1_epi32(0xFFFE0027); const __m128i c16_n36_n41 = _mm_set1_epi32(0xFFDCFFD7); const __m128i c16_p43_p07 = _mm_set1_epi32(0x002B0007); const __m128i c16_n11_p34 = 
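/*
 * Each c16_*_* constant in this block packs two signed 16-bit cosine
 * values into every 32-bit lane, so that a single _mm_madd_epi16 on rows
 * interleaved by _mm_unpacklo_epi16/_mm_unpackhi_epi16 evaluates
 * a*c_lo + b*c_hi per lane. A self-contained sketch (the function name
 * is illustrative, not part of the decoder):
 */
#if 0
#include <emmintrin.h>
static __m128i madd_pair_sketch(__m128i ab_interleaved /* a0,b0,a1,b1,... */)
{
    /* 0xFFF9000F packs lo = +15, hi = -7, i.e. the (n07, p15) pair */
    const __m128i c = _mm_set1_epi32(0xFFF9000F);
    return _mm_madd_epi16(ab_interleaved, c);  /* each lane: a*15 + b*(-7) */
}
#endif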
_mm_set1_epi32(0xFFF50022); const __m128i c16_n30_n44 = _mm_set1_epi32(0xFFE2FFD4); const __m128i c16_p45_p15 = _mm_set1_epi32(0x002D000F); const __m128i c16_n19_p27 = _mm_set1_epi32(0xFFED001B); const __m128i c16_n23_n45 = _mm_set1_epi32(0xFFE9FFD3); const __m128i c16_n15_p36 = _mm_set1_epi32(0xFFF10024); const __m128i c16_n11_n45 = _mm_set1_epi32(0xFFF5FFD3); const __m128i c16_p34_p39 = _mm_set1_epi32(0x00220027); const __m128i c16_n45_n19 = _mm_set1_epi32(0xFFD3FFED); const __m128i c16_p41_n07 = _mm_set1_epi32(0x0029FFF9); const __m128i c16_n23_p30 = _mm_set1_epi32(0xFFE9001E); const __m128i c16_n02_n44 = _mm_set1_epi32(0xFFFEFFD4); const __m128i c16_p27_p43 = _mm_set1_epi32(0x001B002B); const __m128i c16_n27_p34 = _mm_set1_epi32(0xFFE50022); const __m128i c16_p19_n39 = _mm_set1_epi32(0x0013FFD9); const __m128i c16_n11_p43 = _mm_set1_epi32(0xFFF5002B); const __m128i c16_p02_n45 = _mm_set1_epi32(0x0002FFD3); const __m128i c16_p07_p45 = _mm_set1_epi32(0x0007002D); const __m128i c16_n15_n44 = _mm_set1_epi32(0xFFF1FFD4); const __m128i c16_p23_p41 = _mm_set1_epi32(0x00170029); const __m128i c16_n30_n36 = _mm_set1_epi32(0xFFE2FFDC); const __m128i c16_n36_p30 = _mm_set1_epi32(0xFFDC001E); const __m128i c16_p41_n23 = _mm_set1_epi32(0x0029FFE9); const __m128i c16_n44_p15 = _mm_set1_epi32(0xFFD4000F); const __m128i c16_p45_n07 = _mm_set1_epi32(0x002DFFF9); const __m128i c16_n45_n02 = _mm_set1_epi32(0xFFD3FFFE); const __m128i c16_p43_p11 = _mm_set1_epi32(0x002B000B); const __m128i c16_n39_n19 = _mm_set1_epi32(0xFFD9FFED); const __m128i c16_p34_p27 = _mm_set1_epi32(0x0022001B); const __m128i c16_n43_p27 = _mm_set1_epi32(0xFFD5001B); const __m128i c16_p44_n02 = _mm_set1_epi32(0x002CFFFE); const __m128i c16_n30_n23 = _mm_set1_epi32(0xFFE2FFE9); const __m128i c16_p07_p41 = _mm_set1_epi32(0x00070029); const __m128i c16_p19_n45 = _mm_set1_epi32(0x0013FFD3); const __m128i c16_n39_p34 = _mm_set1_epi32(0xFFD90022); const __m128i c16_p45_n11 = _mm_set1_epi32(0x002DFFF5); const __m128i c16_n36_n15 = _mm_set1_epi32(0xFFDCFFF1); const __m128i c16_n45_p23 = _mm_set1_epi32(0xFFD30017); const __m128i c16_p27_p19 = _mm_set1_epi32(0x001B0013); const __m128i c16_p15_n45 = _mm_set1_epi32(0x000FFFD3); const __m128i c16_n44_p30 = _mm_set1_epi32(0xFFD4001E); const __m128i c16_p34_p11 = _mm_set1_epi32(0x0022000B); const __m128i c16_p07_n43 = _mm_set1_epi32(0x0007FFD5); const __m128i c16_n41_p36 = _mm_set1_epi32(0xFFD70024); const __m128i c16_p39_p02 = _mm_set1_epi32(0x00270002); const __m128i c16_n44_p19 = _mm_set1_epi32(0xFFD40013); const __m128i c16_n02_p36 = _mm_set1_epi32(0xFFFE0024); const __m128i c16_p45_n34 = _mm_set1_epi32(0x002DFFDE); const __m128i c16_n15_n23 = _mm_set1_epi32(0xFFF1FFE9); const __m128i c16_n39_p43 = _mm_set1_epi32(0xFFD9002B); const __m128i c16_p30_p07 = _mm_set1_epi32(0x001E0007); const __m128i c16_p27_n45 = _mm_set1_epi32(0x001BFFD3); const __m128i c16_n41_p11 = _mm_set1_epi32(0xFFD7000B); const __m128i c16_n39_p15 = _mm_set1_epi32(0xFFD9000F); const __m128i c16_n30_p45 = _mm_set1_epi32(0xFFE2002D); const __m128i c16_p27_p02 = _mm_set1_epi32(0x001B0002); const __m128i c16_p41_n44 = _mm_set1_epi32(0x0029FFD4); const __m128i c16_n11_n19 = _mm_set1_epi32(0xFFF5FFED); const __m128i c16_n45_p36 = _mm_set1_epi32(0xFFD30024); const __m128i c16_n07_p34 = _mm_set1_epi32(0xFFF90022); const __m128i c16_p43_n23 = _mm_set1_epi32(0x002BFFE9); const __m128i c16_n30_p11 = _mm_set1_epi32(0xFFE2000B); const __m128i c16_n45_p43 = _mm_set1_epi32(0xFFD3002B); const __m128i c16_n19_p36 = 
_mm_set1_epi32(0xFFED0024); const __m128i c16_p23_n02 = _mm_set1_epi32(0x0017FFFE); const __m128i c16_p45_n39 = _mm_set1_epi32(0x002DFFD9); const __m128i c16_p27_n41 = _mm_set1_epi32(0x001BFFD7); const __m128i c16_n15_n07 = _mm_set1_epi32(0xFFF1FFF9); const __m128i c16_n44_p34 = _mm_set1_epi32(0xFFD40022); const __m128i c16_n19_p07 = _mm_set1_epi32(0xFFED0007); const __m128i c16_n39_p30 = _mm_set1_epi32(0xFFD9001E); const __m128i c16_n45_p44 = _mm_set1_epi32(0xFFD3002C); const __m128i c16_n36_p43 = _mm_set1_epi32(0xFFDC002B); const __m128i c16_n15_p27 = _mm_set1_epi32(0xFFF1001B); const __m128i c16_p11_p02 = _mm_set1_epi32(0x000B0002); const __m128i c16_p34_n23 = _mm_set1_epi32(0x0022FFE9); const __m128i c16_p45_n41 = _mm_set1_epi32(0x002DFFD7); const __m128i c16_n07_p02 = _mm_set1_epi32(0xFFF90002); const __m128i c16_n15_p11 = _mm_set1_epi32(0xFFF1000B); const __m128i c16_n23_p19 = _mm_set1_epi32(0xFFE90013); const __m128i c16_n30_p27 = _mm_set1_epi32(0xFFE2001B); const __m128i c16_n36_p34 = _mm_set1_epi32(0xFFDC0022); const __m128i c16_n41_p39 = _mm_set1_epi32(0xFFD70027); const __m128i c16_n44_p43 = _mm_set1_epi32(0xFFD4002B); const __m128i c16_n45_p45 = _mm_set1_epi32(0xFFD3002D); // const __m128i c16_p43_p45 = _mm_set1_epi32(0x002B002D); const __m128i c16_p35_p40 = _mm_set1_epi32(0x00230028); const __m128i c16_p21_p29 = _mm_set1_epi32(0x0015001D); const __m128i c16_p04_p13 = _mm_set1_epi32(0x0004000D); const __m128i c16_p29_p43 = _mm_set1_epi32(0x001D002B); const __m128i c16_n21_p04 = _mm_set1_epi32(0xFFEB0004); const __m128i c16_n45_n40 = _mm_set1_epi32(0xFFD3FFD8); const __m128i c16_n13_n35 = _mm_set1_epi32(0xFFF3FFDD); const __m128i c16_p04_p40 = _mm_set1_epi32(0x00040028); const __m128i c16_n43_n35 = _mm_set1_epi32(0xFFD5FFDD); const __m128i c16_p29_n13 = _mm_set1_epi32(0x001DFFF3); const __m128i c16_p21_p45 = _mm_set1_epi32(0x0015002D); const __m128i c16_n21_p35 = _mm_set1_epi32(0xFFEB0023); const __m128i c16_p04_n43 = _mm_set1_epi32(0x0004FFD5); const __m128i c16_p13_p45 = _mm_set1_epi32(0x000D002D); const __m128i c16_n29_n40 = _mm_set1_epi32(0xFFE3FFD8); const __m128i c16_n40_p29 = _mm_set1_epi32(0xFFD8001D); const __m128i c16_p45_n13 = _mm_set1_epi32(0x002DFFF3); const __m128i c16_n43_n04 = _mm_set1_epi32(0xFFD5FFFC); const __m128i c16_p35_p21 = _mm_set1_epi32(0x00230015); const __m128i c16_n45_p21 = _mm_set1_epi32(0xFFD30015); const __m128i c16_p13_p29 = _mm_set1_epi32(0x000D001D); const __m128i c16_p35_n43 = _mm_set1_epi32(0x0023FFD5); const __m128i c16_n40_p04 = _mm_set1_epi32(0xFFD80004); const __m128i c16_n35_p13 = _mm_set1_epi32(0xFFDD000D); const __m128i c16_n40_p45 = _mm_set1_epi32(0xFFD8002D); const __m128i c16_p04_p21 = _mm_set1_epi32(0x00040015); const __m128i c16_p43_n29 = _mm_set1_epi32(0x002BFFE3); const __m128i c16_n13_p04 = _mm_set1_epi32(0xFFF30004); const __m128i c16_n29_p21 = _mm_set1_epi32(0xFFE30015); const __m128i c16_n40_p35 = _mm_set1_epi32(0xFFD80023); // const __m128i c16_n45_p43 = _mm_set1_epi32(0xFFD3002B); const __m128i c16_p38_p44 = _mm_set1_epi32(0x0026002C); const __m128i c16_p09_p25 = _mm_set1_epi32(0x00090019); const __m128i c16_n09_p38 = _mm_set1_epi32(0xFFF70026); const __m128i c16_n25_n44 = _mm_set1_epi32(0xFFE7FFD4); const __m128i c16_n44_p25 = _mm_set1_epi32(0xFFD40019); const __m128i c16_p38_p09 = _mm_set1_epi32(0x00260009); const __m128i c16_n25_p09 = _mm_set1_epi32(0xFFE70009); const __m128i c16_n44_p38 = _mm_set1_epi32(0xFFD40026); const __m128i c16_p17_p42 = _mm_set1_epi32(0x0011002A); const __m128i c16_n42_p17 = 
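/* The 17/42 pair (c16_p17_p42, c16_n42_p17) drives the 4-point odd stage
 * (EEEO), and the 32/32 pair below it (c16_p32_p32, c16_n32_p32) is the
 * 2-point core (EEEE) of the even-part hierarchy. */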
_mm_set1_epi32(0xFFD60011); const __m128i c16_p32_p32 = _mm_set1_epi32(0x00200020); const __m128i c16_n32_p32 = _mm_set1_epi32(0xFFE00020); __m128i c32_rnd = _mm_set1_epi32(1 << (shift2 - 1)); // add2 int nShift = shift2; // DCT1 __m128i res00[4], res01[4], res02[4], res03[4], res04[4], res05[4], res06[4], res07[4], res08[4], res09[4], res10[4], res11[4], res12[4], res13[4], res14[4], res15[4]; __m128i res16[4], res17[4], res18[4], res19[4], res20[4], res21[4], res22[4], res23[4], res24[4], res25[4], res26[4], res27[4], res28[4], res29[4], res30[4], res31[4]; const __m128i T_00_00A = _mm_unpacklo_epi16(m128iS1[0], m128iS3[0]); // [33 13 32 12 31 11 30 10] const __m128i T_00_00B = _mm_unpackhi_epi16(m128iS1[0], m128iS3[0]); // [37 17 36 16 35 15 34 14] const __m128i T_00_01A = _mm_unpacklo_epi16(m128iS5[0], m128iS7[0]); // [ ] const __m128i T_00_01B = _mm_unpackhi_epi16(m128iS5[0], m128iS7[0]); // [ ] const __m128i T_00_02A = _mm_unpacklo_epi16(m128iS1[1], m128iS3[1]); // [ ] const __m128i T_00_02B = _mm_unpackhi_epi16(m128iS1[1], m128iS3[1]); // [ ] const __m128i T_00_03A = _mm_unpacklo_epi16(m128iS5[1], m128iS7[1]); // [ ] const __m128i T_00_03B = _mm_unpackhi_epi16(m128iS5[1], m128iS7[1]); // [ ] const __m128i T_00_04A = _mm_unpacklo_epi16(m128iS1[2], m128iS3[2]); // [ ] const __m128i T_00_04B = _mm_unpackhi_epi16(m128iS1[2], m128iS3[2]); // [ ] const __m128i T_00_05A = _mm_unpacklo_epi16(m128iS5[2], m128iS7[2]); // [ ] const __m128i T_00_05B = _mm_unpackhi_epi16(m128iS5[2], m128iS7[2]); // [ ] const __m128i T_00_06A = _mm_unpacklo_epi16(m128iS1[3], m128iS3[3]); // [ ] const __m128i T_00_06B = _mm_unpackhi_epi16(m128iS1[3], m128iS3[3]); // [ ] const __m128i T_00_07A = _mm_unpacklo_epi16(m128iS5[3], m128iS7[3]); // const __m128i T_00_07B = _mm_unpackhi_epi16(m128iS5[3], m128iS7[3]); // [ ] const __m128i T_00_08A = _mm_unpacklo_epi16(m128iS2[0], m128iS6[0]); // [ ] const __m128i T_00_08B = _mm_unpackhi_epi16(m128iS2[0], m128iS6[0]); // [ ] const __m128i T_00_09A = _mm_unpacklo_epi16(m128iS2[1], m128iS6[1]); // [ ] const __m128i T_00_09B = _mm_unpackhi_epi16(m128iS2[1], m128iS6[1]); // [ ] const __m128i T_00_10A = _mm_unpacklo_epi16(m128iS2[2], m128iS6[2]); // [ ] const __m128i T_00_10B = _mm_unpackhi_epi16(m128iS2[2], m128iS6[2]); // [ ] const __m128i T_00_11A = _mm_unpacklo_epi16(m128iS2[3], m128iS6[3]); // [ ] const __m128i T_00_11B = _mm_unpackhi_epi16(m128iS2[3], m128iS6[3]); // [ ] const __m128i T_00_12A = _mm_unpacklo_epi16(m128iS4[0], m128iS4[1]); // [ ] const __m128i T_00_12B = _mm_unpackhi_epi16(m128iS4[0], m128iS4[1]); // [ ] const __m128i T_00_13A = _mm_unpacklo_epi16(m128iS4[2], m128iS4[3]); // [ ] const __m128i T_00_13B = _mm_unpackhi_epi16(m128iS4[2], m128iS4[3]); // [ ] const __m128i T_00_14A = _mm_unpacklo_epi16(m128iS0[1], m128iS0[3]); // const __m128i T_00_14B = _mm_unpackhi_epi16(m128iS0[1], m128iS0[3]); // [ ] const __m128i T_00_15A = _mm_unpacklo_epi16(m128iS0[0], m128iS0[2]); // const __m128i T_00_15B = _mm_unpackhi_epi16(m128iS0[0], m128iS0[2]); // [ ] __m128i O00A, O01A, O02A, O03A, O04A, O05A, O06A, O07A, O08A, O09A, O10A, O11A, O12A, O13A, O14A, O15A; __m128i O00B, O01B, O02B, O03B, O04B, O05B, O06B, O07B, O08B, O09B, O10B, O11B, O12B, O13B, O14B, O15B; __m128i EO0A, EO1A, EO2A, EO3A, EO4A, EO5A, EO6A, EO7A; __m128i EO0B, EO1B, EO2B, EO3B, EO4B, EO5B, EO6B, EO7B; __m128i T00, T01, T02, T03; #define COMPUTE_ROW(r0103, r0507, r0911, r1315, r1719, r2123, r2527, r2931, c0103, c0507, c0911, c1315, c1719, c2123, c2527, c2931, row) \ T00 = 
_mm_add_epi32(_mm_madd_epi16(r0103, c0103), _mm_madd_epi16(r0507, c0507)); \ T01 = _mm_add_epi32(_mm_madd_epi16(r0911, c0911), _mm_madd_epi16(r1315, c1315)); \ T02 = _mm_add_epi32(_mm_madd_epi16(r1719, c1719), _mm_madd_epi16(r2123, c2123)); \ T03 = _mm_add_epi32(_mm_madd_epi16(r2527, c2527), _mm_madd_epi16(r2931, c2931)); \ row = _mm_add_epi32(_mm_add_epi32(T00, T01), _mm_add_epi32(T02, T03)); COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_p45_p45, c16_p43_p44, c16_p39_p41, c16_p34_p36, c16_p27_p30, c16_p19_p23, c16_p11_p15, c16_p02_p07, O00A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_p41_p45, c16_p23_p34, c16_n02_p11, c16_n27_n15, c16_n43_n36, c16_n44_n45, c16_n30_n39, c16_n07_n19, O01A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_p34_p44, c16_n07_p15, c16_n41_n27, c16_n39_n45, c16_n02_n23, c16_p36_p19, c16_p43_p45, c16_p11_p30, O02A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_p23_p43, c16_n34_n07, c16_n36_n45, c16_p19_n11, c16_p44_p41, c16_n02_p27, c16_n45_n30, c16_n15_n39, O03A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_p11_p41, c16_n45_n27, c16_p07_n30, c16_p43_p39, c16_n23_p15, c16_n34_n45, c16_p36_p02, c16_p19_p44, O04A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n02_p39, c16_n36_n41, c16_p43_p07, c16_n11_p34, c16_n30_n44, c16_p45_p15, c16_n19_p27, c16_n23_n45, O05A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n15_p36, c16_n11_n45, c16_p34_p39, c16_n45_n19, c16_p41_n07, c16_n23_p30, c16_n02_n44, c16_p27_p43, O06A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n27_p34, c16_p19_n39, c16_n11_p43, c16_p02_n45, c16_p07_p45, c16_n15_n44, c16_p23_p41, c16_n30_n36, O07A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n36_p30, c16_p41_n23, c16_n44_p15, c16_p45_n07, c16_n45_n02, c16_p43_p11, c16_n39_n19, c16_p34_p27, O08A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n43_p27, c16_p44_n02, c16_n30_n23, c16_p07_p41, c16_p19_n45, c16_n39_p34, c16_p45_n11, c16_n36_n15, O09A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n45_p23, c16_p27_p19, c16_p15_n45, c16_n44_p30, c16_p34_p11, c16_p07_n43, c16_n41_p36, c16_p39_p02, O10A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n44_p19, c16_n02_p36, c16_p45_n34, c16_n15_n23, c16_n39_p43, c16_p30_p07, c16_p27_n45, c16_n41_p11, O11A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n39_p15, c16_n30_p45, c16_p27_p02, c16_p41_n44, c16_n11_n19, c16_n45_p36, c16_n07_p34, c16_p43_n23, O12A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n30_p11, c16_n45_p43, c16_n19_p36, c16_p23_n02, c16_p45_n39, c16_p27_n41, c16_n15_n07, c16_n44_p34, O13A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n19_p07, c16_n39_p30, c16_n45_p44, c16_n36_p43, c16_n15_p27, c16_p11_p02, c16_p34_n23, c16_p45_n41, O14A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ 
c16_n07_p02, c16_n15_p11, c16_n23_p19, c16_n30_p27, c16_n36_p34, c16_n41_p39, c16_n44_p43, c16_n45_p45, O15A) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_p45_p45, c16_p43_p44, c16_p39_p41, c16_p34_p36, c16_p27_p30, c16_p19_p23, c16_p11_p15, c16_p02_p07, O00B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_p41_p45, c16_p23_p34, c16_n02_p11, c16_n27_n15, c16_n43_n36, c16_n44_n45, c16_n30_n39, c16_n07_n19, O01B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_p34_p44, c16_n07_p15, c16_n41_n27, c16_n39_n45, c16_n02_n23, c16_p36_p19, c16_p43_p45, c16_p11_p30, O02B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_p23_p43, c16_n34_n07, c16_n36_n45, c16_p19_n11, c16_p44_p41, c16_n02_p27, c16_n45_n30, c16_n15_n39, O03B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_p11_p41, c16_n45_n27, c16_p07_n30, c16_p43_p39, c16_n23_p15, c16_n34_n45, c16_p36_p02, c16_p19_p44, O04B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n02_p39, c16_n36_n41, c16_p43_p07, c16_n11_p34, c16_n30_n44, c16_p45_p15, c16_n19_p27, c16_n23_n45, O05B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n15_p36, c16_n11_n45, c16_p34_p39, c16_n45_n19, c16_p41_n07, c16_n23_p30, c16_n02_n44, c16_p27_p43, O06B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n27_p34, c16_p19_n39, c16_n11_p43, c16_p02_n45, c16_p07_p45, c16_n15_n44, c16_p23_p41, c16_n30_n36, O07B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n36_p30, c16_p41_n23, c16_n44_p15, c16_p45_n07, c16_n45_n02, c16_p43_p11, c16_n39_n19, c16_p34_p27, O08B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n43_p27, c16_p44_n02, c16_n30_n23, c16_p07_p41, c16_p19_n45, c16_n39_p34, c16_p45_n11, c16_n36_n15, O09B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n45_p23, c16_p27_p19, c16_p15_n45, c16_n44_p30, c16_p34_p11, c16_p07_n43, c16_n41_p36, c16_p39_p02, O10B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n44_p19, c16_n02_p36, c16_p45_n34, c16_n15_n23, c16_n39_p43, c16_p30_p07, c16_p27_n45, c16_n41_p11, O11B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n39_p15, c16_n30_p45, c16_p27_p02, c16_p41_n44, c16_n11_n19, c16_n45_p36, c16_n07_p34, c16_p43_n23, O12B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n30_p11, c16_n45_p43, c16_n19_p36, c16_p23_n02, c16_p45_n39, c16_p27_n41, c16_n15_n07, c16_n44_p34, O13B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n19_p07, c16_n39_p30, c16_n45_p44, c16_n36_p43, c16_n15_p27, c16_p11_p02, c16_p34_n23, c16_p45_n41, O14B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n07_p02, c16_n15_p11, c16_n23_p19, c16_n30_p27, c16_n36_p34, c16_n41_p39, c16_n44_p43, c16_n45_p45, O15B) #undef COMPUTE_ROW { #define COMPUTE_ROW(row0206, row1014, row1822, row2630, c0206, c1014, c1822, c2630, row) \ T00 = _mm_add_epi32(_mm_madd_epi16(row0206, c0206), 
_mm_madd_epi16(row1014, c1014)); \ T01 = _mm_add_epi32(_mm_madd_epi16(row1822, c1822), _mm_madd_epi16(row2630, c2630)); \ row = _mm_add_epi32(T00, T01); COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_p43_p45, c16_p35_p40, c16_p21_p29, c16_p04_p13, EO0A) COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_p29_p43, c16_n21_p04, c16_n45_n40, c16_n13_n35, EO1A) COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_p04_p40, c16_n43_n35, c16_p29_n13, c16_p21_p45, EO2A) COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_n21_p35, c16_p04_n43, c16_p13_p45, c16_n29_n40, EO3A) COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_n40_p29, c16_p45_n13, c16_n43_n04, c16_p35_p21, EO4A) COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_n45_p21, c16_p13_p29, c16_p35_n43, c16_n40_p04, EO5A) COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_n35_p13, c16_n40_p45, c16_p04_p21, c16_p43_n29, EO6A) COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_n13_p04, c16_n29_p21, c16_n40_p35, c16_n45_p43, EO7A) COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_p43_p45, c16_p35_p40, c16_p21_p29, c16_p04_p13, EO0B) COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_p29_p43, c16_n21_p04, c16_n45_n40, c16_n13_n35, EO1B) COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_p04_p40, c16_n43_n35, c16_p29_n13, c16_p21_p45, EO2B) COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_n21_p35, c16_p04_n43, c16_p13_p45, c16_n29_n40, EO3B) COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_n40_p29, c16_p45_n13, c16_n43_n04, c16_p35_p21, EO4B) COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_n45_p21, c16_p13_p29, c16_p35_n43, c16_n40_p04, EO5B) COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_n35_p13, c16_n40_p45, c16_p04_p21, c16_p43_n29, EO6B) COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_n13_p04, c16_n29_p21, c16_n40_p35, c16_n45_p43, EO7B) #undef COMPUTE_ROW } { const __m128i EEO0A = _mm_add_epi32(_mm_madd_epi16(T_00_12A, c16_p38_p44), _mm_madd_epi16(T_00_13A, c16_p09_p25)); const __m128i EEO1A = _mm_add_epi32(_mm_madd_epi16(T_00_12A, c16_n09_p38), _mm_madd_epi16(T_00_13A, c16_n25_n44)); const __m128i EEO2A = _mm_add_epi32(_mm_madd_epi16(T_00_12A, c16_n44_p25), _mm_madd_epi16(T_00_13A, c16_p38_p09)); const __m128i EEO3A = _mm_add_epi32(_mm_madd_epi16(T_00_12A, c16_n25_p09), _mm_madd_epi16(T_00_13A, c16_n44_p38)); const __m128i EEO0B = _mm_add_epi32(_mm_madd_epi16(T_00_12B, c16_p38_p44), _mm_madd_epi16(T_00_13B, c16_p09_p25)); const __m128i EEO1B = _mm_add_epi32(_mm_madd_epi16(T_00_12B, c16_n09_p38), _mm_madd_epi16(T_00_13B, c16_n25_n44)); const __m128i EEO2B = _mm_add_epi32(_mm_madd_epi16(T_00_12B, c16_n44_p25), _mm_madd_epi16(T_00_13B, c16_p38_p09)); const __m128i EEO3B = _mm_add_epi32(_mm_madd_epi16(T_00_12B, c16_n25_p09), _mm_madd_epi16(T_00_13B, c16_n44_p38)); const __m128i EEEO0A = _mm_madd_epi16(T_00_14A, c16_p17_p42); const __m128i EEEO0B = _mm_madd_epi16(T_00_14B, c16_p17_p42); const __m128i EEEO1A = _mm_madd_epi16(T_00_14A, c16_n42_p17); const __m128i EEEO1B = _mm_madd_epi16(T_00_14B, c16_n42_p17); const __m128i EEEE0A = _mm_madd_epi16(T_00_15A, c16_p32_p32); const __m128i EEEE0B = _mm_madd_epi16(T_00_15B, c16_p32_p32); const __m128i EEEE1A = _mm_madd_epi16(T_00_15A, c16_n32_p32); const __m128i EEEE1B = _mm_madd_epi16(T_00_15B, c16_n32_p32); const __m128i EEE0A = _mm_add_epi32(EEEE0A, EEEO0A); // EEE0 = EEEE0 + EEEO0 const __m128i EEE0B = _mm_add_epi32(EEEE0B, EEEO0B); const __m128i EEE1A = _mm_add_epi32(EEEE1A, EEEO1A); // EEE1 = EEEE1 + 
EEEO1 const __m128i EEE1B = _mm_add_epi32(EEEE1B, EEEO1B); const __m128i EEE3A = _mm_sub_epi32(EEEE0A, EEEO0A); // EEE2 = EEEE0 - EEEO0 const __m128i EEE3B = _mm_sub_epi32(EEEE0B, EEEO0B); const __m128i EEE2A = _mm_sub_epi32(EEEE1A, EEEO1A); // EEE3 = EEEE1 - EEEO1 const __m128i EEE2B = _mm_sub_epi32(EEEE1B, EEEO1B); const __m128i EE0A = _mm_add_epi32(EEE0A, EEO0A); // EE0 = EEE0 + EEO0 const __m128i EE0B = _mm_add_epi32(EEE0B, EEO0B); const __m128i EE1A = _mm_add_epi32(EEE1A, EEO1A); // EE1 = EEE1 + EEO1 const __m128i EE1B = _mm_add_epi32(EEE1B, EEO1B); const __m128i EE2A = _mm_add_epi32(EEE2A, EEO2A); // EE2 = EEE0 + EEO0 const __m128i EE2B = _mm_add_epi32(EEE2B, EEO2B); const __m128i EE3A = _mm_add_epi32(EEE3A, EEO3A); // EE3 = EEE1 + EEO1 const __m128i EE3B = _mm_add_epi32(EEE3B, EEO3B); const __m128i EE7A = _mm_sub_epi32(EEE0A, EEO0A); // EE7 = EEE0 - EEO0 const __m128i EE7B = _mm_sub_epi32(EEE0B, EEO0B); const __m128i EE6A = _mm_sub_epi32(EEE1A, EEO1A); // EE6 = EEE1 - EEO1 const __m128i EE6B = _mm_sub_epi32(EEE1B, EEO1B); const __m128i EE5A = _mm_sub_epi32(EEE2A, EEO2A); // EE5 = EEE0 - EEO0 const __m128i EE5B = _mm_sub_epi32(EEE2B, EEO2B); const __m128i EE4A = _mm_sub_epi32(EEE3A, EEO3A); // EE4 = EEE1 - EEO1 const __m128i EE4B = _mm_sub_epi32(EEE3B, EEO3B); const __m128i E0A = _mm_add_epi32(EE0A, EO0A); // E0 = EE0 + EO0 const __m128i E0B = _mm_add_epi32(EE0B, EO0B); const __m128i E1A = _mm_add_epi32(EE1A, EO1A); // E1 = EE1 + EO1 const __m128i E1B = _mm_add_epi32(EE1B, EO1B); const __m128i E2A = _mm_add_epi32(EE2A, EO2A); // E2 = EE2 + EO2 const __m128i E2B = _mm_add_epi32(EE2B, EO2B); const __m128i E3A = _mm_add_epi32(EE3A, EO3A); // E3 = EE3 + EO3 const __m128i E3B = _mm_add_epi32(EE3B, EO3B); const __m128i E4A = _mm_add_epi32(EE4A, EO4A); // E4 = const __m128i E4B = _mm_add_epi32(EE4B, EO4B); const __m128i E5A = _mm_add_epi32(EE5A, EO5A); // E5 = const __m128i E5B = _mm_add_epi32(EE5B, EO5B); const __m128i E6A = _mm_add_epi32(EE6A, EO6A); // E6 = const __m128i E6B = _mm_add_epi32(EE6B, EO6B); const __m128i E7A = _mm_add_epi32(EE7A, EO7A); // E7 = const __m128i E7B = _mm_add_epi32(EE7B, EO7B); const __m128i EFA = _mm_sub_epi32(EE0A, EO0A); // EF = EE0 - EO0 const __m128i EFB = _mm_sub_epi32(EE0B, EO0B); const __m128i EEA = _mm_sub_epi32(EE1A, EO1A); // EE = EE1 - EO1 const __m128i EEB = _mm_sub_epi32(EE1B, EO1B); const __m128i EDA = _mm_sub_epi32(EE2A, EO2A); // ED = EE2 - EO2 const __m128i EDB = _mm_sub_epi32(EE2B, EO2B); const __m128i ECA = _mm_sub_epi32(EE3A, EO3A); // EC = EE3 - EO3 const __m128i ECB = _mm_sub_epi32(EE3B, EO3B); const __m128i EBA = _mm_sub_epi32(EE4A, EO4A); // EB = const __m128i EBB = _mm_sub_epi32(EE4B, EO4B); const __m128i EAA = _mm_sub_epi32(EE5A, EO5A); // EA = const __m128i EAB = _mm_sub_epi32(EE5B, EO5B); const __m128i E9A = _mm_sub_epi32(EE6A, EO6A); // E9 = const __m128i E9B = _mm_sub_epi32(EE6B, EO6B); const __m128i E8A = _mm_sub_epi32(EE7A, EO7A); // E8 = const __m128i E8B = _mm_sub_epi32(EE7B, EO7B); const __m128i T10A = _mm_add_epi32(E0A, c32_rnd); // E0 + rnd const __m128i T10B = _mm_add_epi32(E0B, c32_rnd); const __m128i T11A = _mm_add_epi32(E1A, c32_rnd); // E1 + rnd const __m128i T11B = _mm_add_epi32(E1B, c32_rnd); const __m128i T12A = _mm_add_epi32(E2A, c32_rnd); // E2 + rnd const __m128i T12B = _mm_add_epi32(E2B, c32_rnd); const __m128i T13A = _mm_add_epi32(E3A, c32_rnd); // E3 + rnd const __m128i T13B = _mm_add_epi32(E3B, c32_rnd); const __m128i T14A = _mm_add_epi32(E4A, c32_rnd); // E4 + rnd const __m128i T14B = _mm_add_epi32(E4B, 
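/* Naming guide for the even-part hierarchy above (some of the trailing
 * // comments in this stretch carry stale indices):
 *   EEE0 = EEEE0 + EEEO0,  EEE3 = EEEE0 - EEEO0,
 *   EEE1 = EEEE1 + EEEO1,  EEE2 = EEEE1 - EEEO1,
 *   EEk  = EEEk + EEOk  and  EE(7-k) = EEEk - EEOk  for k = 0..3,
 *   Ek   = EEk  + EOk   and  E(15-k) = EEk  - EOk   for k = 0..7,
 * where E8..E15 are spelled E8A/E9A/EAA/EBA/ECA/EDA/EEA/EFA. */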
c32_rnd); const __m128i T15A = _mm_add_epi32(E5A, c32_rnd); // E5 + rnd const __m128i T15B = _mm_add_epi32(E5B, c32_rnd); const __m128i T16A = _mm_add_epi32(E6A, c32_rnd); // E6 + rnd const __m128i T16B = _mm_add_epi32(E6B, c32_rnd); const __m128i T17A = _mm_add_epi32(E7A, c32_rnd); // E7 + rnd const __m128i T17B = _mm_add_epi32(E7B, c32_rnd); const __m128i T18A = _mm_add_epi32(E8A, c32_rnd); // E8 + rnd const __m128i T18B = _mm_add_epi32(E8B, c32_rnd); const __m128i T19A = _mm_add_epi32(E9A, c32_rnd); // E9 + rnd const __m128i T19B = _mm_add_epi32(E9B, c32_rnd); const __m128i T1AA = _mm_add_epi32(EAA, c32_rnd); // E10 + rnd const __m128i T1AB = _mm_add_epi32(EAB, c32_rnd); const __m128i T1BA = _mm_add_epi32(EBA, c32_rnd); // E11 + rnd const __m128i T1BB = _mm_add_epi32(EBB, c32_rnd); const __m128i T1CA = _mm_add_epi32(ECA, c32_rnd); // E12 + rnd const __m128i T1CB = _mm_add_epi32(ECB, c32_rnd); const __m128i T1DA = _mm_add_epi32(EDA, c32_rnd); // E13 + rnd const __m128i T1DB = _mm_add_epi32(EDB, c32_rnd); const __m128i T1EA = _mm_add_epi32(EEA, c32_rnd); // E14 + rnd const __m128i T1EB = _mm_add_epi32(EEB, c32_rnd); const __m128i T1FA = _mm_add_epi32(EFA, c32_rnd); // E15 + rnd const __m128i T1FB = _mm_add_epi32(EFB, c32_rnd); const __m128i T2_00A = _mm_add_epi32(T10A, O00A); // E0 + O0 + rnd const __m128i T2_00B = _mm_add_epi32(T10B, O00B); const __m128i T2_01A = _mm_add_epi32(T11A, O01A); // E1 + O1 + rnd const __m128i T2_01B = _mm_add_epi32(T11B, O01B); const __m128i T2_02A = _mm_add_epi32(T12A, O02A); // E2 + O2 + rnd const __m128i T2_02B = _mm_add_epi32(T12B, O02B); const __m128i T2_03A = _mm_add_epi32(T13A, O03A); // E3 + O3 + rnd const __m128i T2_03B = _mm_add_epi32(T13B, O03B); const __m128i T2_04A = _mm_add_epi32(T14A, O04A); // E4 const __m128i T2_04B = _mm_add_epi32(T14B, O04B); const __m128i T2_05A = _mm_add_epi32(T15A, O05A); // E5 const __m128i T2_05B = _mm_add_epi32(T15B, O05B); const __m128i T2_06A = _mm_add_epi32(T16A, O06A); // E6 const __m128i T2_06B = _mm_add_epi32(T16B, O06B); const __m128i T2_07A = _mm_add_epi32(T17A, O07A); // E7 const __m128i T2_07B = _mm_add_epi32(T17B, O07B); const __m128i T2_08A = _mm_add_epi32(T18A, O08A); // E8 const __m128i T2_08B = _mm_add_epi32(T18B, O08B); const __m128i T2_09A = _mm_add_epi32(T19A, O09A); // E9 const __m128i T2_09B = _mm_add_epi32(T19B, O09B); const __m128i T2_10A = _mm_add_epi32(T1AA, O10A); // E10 const __m128i T2_10B = _mm_add_epi32(T1AB, O10B); const __m128i T2_11A = _mm_add_epi32(T1BA, O11A); // E11 const __m128i T2_11B = _mm_add_epi32(T1BB, O11B); const __m128i T2_12A = _mm_add_epi32(T1CA, O12A); // E12 const __m128i T2_12B = _mm_add_epi32(T1CB, O12B); const __m128i T2_13A = _mm_add_epi32(T1DA, O13A); // E13 const __m128i T2_13B = _mm_add_epi32(T1DB, O13B); const __m128i T2_14A = _mm_add_epi32(T1EA, O14A); // E14 const __m128i T2_14B = _mm_add_epi32(T1EB, O14B); const __m128i T2_15A = _mm_add_epi32(T1FA, O15A); // E15 const __m128i T2_15B = _mm_add_epi32(T1FB, O15B); const __m128i T2_31A = _mm_sub_epi32(T10A, O00A); // E0 - O0 + rnd const __m128i T2_31B = _mm_sub_epi32(T10B, O00B); const __m128i T2_30A = _mm_sub_epi32(T11A, O01A); // E1 - O1 + rnd const __m128i T2_30B = _mm_sub_epi32(T11B, O01B); const __m128i T2_29A = _mm_sub_epi32(T12A, O02A); // E2 - O2 + rnd const __m128i T2_29B = _mm_sub_epi32(T12B, O02B); const __m128i T2_28A = _mm_sub_epi32(T13A, O03A); // E3 - O3 + rnd const __m128i T2_28B = _mm_sub_epi32(T13B, O03B); const __m128i T2_27A = _mm_sub_epi32(T14A, O04A); // E4 const __m128i T2_27B = 
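/* Output symmetry of the 32-point butterfly: T2_k = E_k + O_k + rnd and
 * T2_(31-k) = E_k - O_k + rnd for k = 0..15; the arithmetic shifts below
 * produce the 32 rows, and _mm_packs_epi32 saturates them to 16 bits. */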
_mm_sub_epi32(T14B, O04B); const __m128i T2_26A = _mm_sub_epi32(T15A, O05A); // E5 const __m128i T2_26B = _mm_sub_epi32(T15B, O05B); const __m128i T2_25A = _mm_sub_epi32(T16A, O06A); // E6 const __m128i T2_25B = _mm_sub_epi32(T16B, O06B); const __m128i T2_24A = _mm_sub_epi32(T17A, O07A); // E7 const __m128i T2_24B = _mm_sub_epi32(T17B, O07B); const __m128i T2_23A = _mm_sub_epi32(T18A, O08A); // const __m128i T2_23B = _mm_sub_epi32(T18B, O08B); const __m128i T2_22A = _mm_sub_epi32(T19A, O09A); // const __m128i T2_22B = _mm_sub_epi32(T19B, O09B); const __m128i T2_21A = _mm_sub_epi32(T1AA, O10A); // const __m128i T2_21B = _mm_sub_epi32(T1AB, O10B); const __m128i T2_20A = _mm_sub_epi32(T1BA, O11A); // const __m128i T2_20B = _mm_sub_epi32(T1BB, O11B); const __m128i T2_19A = _mm_sub_epi32(T1CA, O12A); // const __m128i T2_19B = _mm_sub_epi32(T1CB, O12B); const __m128i T2_18A = _mm_sub_epi32(T1DA, O13A); // const __m128i T2_18B = _mm_sub_epi32(T1DB, O13B); const __m128i T2_17A = _mm_sub_epi32(T1EA, O14A); // const __m128i T2_17B = _mm_sub_epi32(T1EB, O14B); const __m128i T2_16A = _mm_sub_epi32(T1FA, O15A); // const __m128i T2_16B = _mm_sub_epi32(T1FB, O15B); const __m128i T3_00A = _mm_srai_epi32(T2_00A, nShift); // [30 20 10 00] const __m128i T3_00B = _mm_srai_epi32(T2_00B, nShift); // [70 60 50 40] const __m128i T3_01A = _mm_srai_epi32(T2_01A, nShift); // [31 21 11 01] const __m128i T3_01B = _mm_srai_epi32(T2_01B, nShift); // [71 61 51 41] const __m128i T3_02A = _mm_srai_epi32(T2_02A, nShift); // [32 22 12 02] const __m128i T3_02B = _mm_srai_epi32(T2_02B, nShift); // [72 62 52 42] const __m128i T3_03A = _mm_srai_epi32(T2_03A, nShift); // [33 23 13 03] const __m128i T3_03B = _mm_srai_epi32(T2_03B, nShift); // [73 63 53 43] const __m128i T3_04A = _mm_srai_epi32(T2_04A, nShift); // [33 24 14 04] const __m128i T3_04B = _mm_srai_epi32(T2_04B, nShift); // [74 64 54 44] const __m128i T3_05A = _mm_srai_epi32(T2_05A, nShift); // [35 25 15 05] const __m128i T3_05B = _mm_srai_epi32(T2_05B, nShift); // [75 65 55 45] const __m128i T3_06A = _mm_srai_epi32(T2_06A, nShift); // [36 26 16 06] const __m128i T3_06B = _mm_srai_epi32(T2_06B, nShift); // [76 66 56 46] const __m128i T3_07A = _mm_srai_epi32(T2_07A, nShift); // [37 27 17 07] const __m128i T3_07B = _mm_srai_epi32(T2_07B, nShift); // [77 67 57 47] const __m128i T3_08A = _mm_srai_epi32(T2_08A, nShift); // [30 20 10 00] x8 const __m128i T3_08B = _mm_srai_epi32(T2_08B, nShift); // [70 60 50 40] const __m128i T3_09A = _mm_srai_epi32(T2_09A, nShift); // [31 21 11 01] x9 const __m128i T3_09B = _mm_srai_epi32(T2_09B, nShift); // [71 61 51 41] const __m128i T3_10A = _mm_srai_epi32(T2_10A, nShift); // [32 22 12 02] xA const __m128i T3_10B = _mm_srai_epi32(T2_10B, nShift); // [72 62 52 42] const __m128i T3_11A = _mm_srai_epi32(T2_11A, nShift); // [33 23 13 03] xB const __m128i T3_11B = _mm_srai_epi32(T2_11B, nShift); // [73 63 53 43] const __m128i T3_12A = _mm_srai_epi32(T2_12A, nShift); // [33 24 14 04] xC const __m128i T3_12B = _mm_srai_epi32(T2_12B, nShift); // [74 64 54 44] const __m128i T3_13A = _mm_srai_epi32(T2_13A, nShift); // [35 25 15 05] xD const __m128i T3_13B = _mm_srai_epi32(T2_13B, nShift); // [75 65 55 45] const __m128i T3_14A = _mm_srai_epi32(T2_14A, nShift); // [36 26 16 06] xE const __m128i T3_14B = _mm_srai_epi32(T2_14B, nShift); // [76 66 56 46] const __m128i T3_15A = _mm_srai_epi32(T2_15A, nShift); // [37 27 17 07] xF const __m128i T3_15B = _mm_srai_epi32(T2_15B, nShift); // [77 67 57 47] const __m128i T3_16A = _mm_srai_epi32(T2_16A, nShift); // 
[30 20 10 00] const __m128i T3_16B = _mm_srai_epi32(T2_16B, nShift); // [70 60 50 40] const __m128i T3_17A = _mm_srai_epi32(T2_17A, nShift); // [31 21 11 01] const __m128i T3_17B = _mm_srai_epi32(T2_17B, nShift); // [71 61 51 41] const __m128i T3_18A = _mm_srai_epi32(T2_18A, nShift); // [32 22 12 02] const __m128i T3_18B = _mm_srai_epi32(T2_18B, nShift); // [72 62 52 42] const __m128i T3_19A = _mm_srai_epi32(T2_19A, nShift); // [33 23 13 03] const __m128i T3_19B = _mm_srai_epi32(T2_19B, nShift); // [73 63 53 43] const __m128i T3_20A = _mm_srai_epi32(T2_20A, nShift); // [33 24 14 04] const __m128i T3_20B = _mm_srai_epi32(T2_20B, nShift); // [74 64 54 44] const __m128i T3_21A = _mm_srai_epi32(T2_21A, nShift); // [35 25 15 05] const __m128i T3_21B = _mm_srai_epi32(T2_21B, nShift); // [75 65 55 45] const __m128i T3_22A = _mm_srai_epi32(T2_22A, nShift); // [36 26 16 06] const __m128i T3_22B = _mm_srai_epi32(T2_22B, nShift); // [76 66 56 46] const __m128i T3_23A = _mm_srai_epi32(T2_23A, nShift); // [37 27 17 07] const __m128i T3_23B = _mm_srai_epi32(T2_23B, nShift); // [77 67 57 47] const __m128i T3_24A = _mm_srai_epi32(T2_24A, nShift); // [30 20 10 00] x8 const __m128i T3_24B = _mm_srai_epi32(T2_24B, nShift); // [70 60 50 40] const __m128i T3_25A = _mm_srai_epi32(T2_25A, nShift); // [31 21 11 01] x9 const __m128i T3_25B = _mm_srai_epi32(T2_25B, nShift); // [71 61 51 41] const __m128i T3_26A = _mm_srai_epi32(T2_26A, nShift); // [32 22 12 02] xA const __m128i T3_26B = _mm_srai_epi32(T2_26B, nShift); // [72 62 52 42] const __m128i T3_27A = _mm_srai_epi32(T2_27A, nShift); // [33 23 13 03] xB const __m128i T3_27B = _mm_srai_epi32(T2_27B, nShift); // [73 63 53 43] const __m128i T3_28A = _mm_srai_epi32(T2_28A, nShift); // [33 24 14 04] xC const __m128i T3_28B = _mm_srai_epi32(T2_28B, nShift); // [74 64 54 44] const __m128i T3_29A = _mm_srai_epi32(T2_29A, nShift); // [35 25 15 05] xD const __m128i T3_29B = _mm_srai_epi32(T2_29B, nShift); // [75 65 55 45] const __m128i T3_30A = _mm_srai_epi32(T2_30A, nShift); // [36 26 16 06] xE const __m128i T3_30B = _mm_srai_epi32(T2_30B, nShift); // [76 66 56 46] const __m128i T3_31A = _mm_srai_epi32(T2_31A, nShift); // [37 27 17 07] xF const __m128i T3_31B = _mm_srai_epi32(T2_31B, nShift); // [77 67 57 47] res00[0] = _mm_packs_epi32(T3_00A, T3_00B); // [70 60 50 40 30 20 10 00] res01[0] = _mm_packs_epi32(T3_01A, T3_01B); // [71 61 51 41 31 21 11 01] res02[0] = _mm_packs_epi32(T3_02A, T3_02B); // [72 62 52 42 32 22 12 02] res03[0] = _mm_packs_epi32(T3_03A, T3_03B); // [73 63 53 43 33 23 13 03] res04[0] = _mm_packs_epi32(T3_04A, T3_04B); // [74 64 54 44 34 24 14 04] res05[0] = _mm_packs_epi32(T3_05A, T3_05B); // [75 65 55 45 35 25 15 05] res06[0] = _mm_packs_epi32(T3_06A, T3_06B); // [76 66 56 46 36 26 16 06] res07[0] = _mm_packs_epi32(T3_07A, T3_07B); // [77 67 57 47 37 27 17 07] res08[0] = _mm_packs_epi32(T3_08A, T3_08B); // [A0 ... 80] res09[0] = _mm_packs_epi32(T3_09A, T3_09B); // [A1 ... 81] res10[0] = _mm_packs_epi32(T3_10A, T3_10B); // [A2 ... 82] res11[0] = _mm_packs_epi32(T3_11A, T3_11B); // [A3 ... 83] res12[0] = _mm_packs_epi32(T3_12A, T3_12B); // [A4 ... 84] res13[0] = _mm_packs_epi32(T3_13A, T3_13B); // [A5 ... 85] res14[0] = _mm_packs_epi32(T3_14A, T3_14B); // [A6 ... 86] res15[0] = _mm_packs_epi32(T3_15A, T3_15B); // [A7 ... 
87] res16[0] = _mm_packs_epi32(T3_16A, T3_16B); res17[0] = _mm_packs_epi32(T3_17A, T3_17B); res18[0] = _mm_packs_epi32(T3_18A, T3_18B); res19[0] = _mm_packs_epi32(T3_19A, T3_19B); res20[0] = _mm_packs_epi32(T3_20A, T3_20B); res21[0] = _mm_packs_epi32(T3_21A, T3_21B); res22[0] = _mm_packs_epi32(T3_22A, T3_22B); res23[0] = _mm_packs_epi32(T3_23A, T3_23B); res24[0] = _mm_packs_epi32(T3_24A, T3_24B); res25[0] = _mm_packs_epi32(T3_25A, T3_25B); res26[0] = _mm_packs_epi32(T3_26A, T3_26B); res27[0] = _mm_packs_epi32(T3_27A, T3_27B); res28[0] = _mm_packs_epi32(T3_28A, T3_28B); res29[0] = _mm_packs_epi32(T3_29A, T3_29B); res30[0] = _mm_packs_epi32(T3_30A, T3_30B); res31[0] = _mm_packs_epi32(T3_31A, T3_31B); } //transpose matrix 8x8 16bit. { __m128i tr0_0, tr0_1, tr0_2, tr0_3, tr0_4, tr0_5, tr0_6, tr0_7; __m128i tr1_0, tr1_1, tr1_2, tr1_3, tr1_4, tr1_5, tr1_6, tr1_7; #define TRANSPOSE_8x8_16BIT(I0, I1, I2, I3, I4, I5, I6, I7, O0, O1, O2, O3, O4, O5, O6, O7) \ tr0_0 = _mm_unpacklo_epi16(I0, I1); \ tr0_1 = _mm_unpacklo_epi16(I2, I3); \ tr0_2 = _mm_unpackhi_epi16(I0, I1); \ tr0_3 = _mm_unpackhi_epi16(I2, I3); \ tr0_4 = _mm_unpacklo_epi16(I4, I5); \ tr0_5 = _mm_unpacklo_epi16(I6, I7); \ tr0_6 = _mm_unpackhi_epi16(I4, I5); \ tr0_7 = _mm_unpackhi_epi16(I6, I7); \ tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \ tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \ tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \ tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \ O0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ O1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ O2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ O3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ O4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \ O5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \ O6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \ O7 = _mm_unpackhi_epi64(tr1_3, tr1_7); TRANSPOSE_8x8_16BIT(res00[0], res01[0], res02[0], res03[0], res04[0], res05[0], res06[0], res07[0], m128iS0[0], m128iS1[0], m128iS2[0], m128iS3[0], m128iS4[0], m128iS5[0], m128iS6[0], m128iS7[0]) TRANSPOSE_8x8_16BIT(res08[0], res09[0], res10[0], res11[0], res12[0], res13[0], res14[0], res15[0], m128iS0[1], m128iS1[1], m128iS2[1], m128iS3[1], m128iS4[1], m128iS5[1], m128iS6[1], m128iS7[1]) TRANSPOSE_8x8_16BIT(res16[0], res17[0], res18[0], res19[0], res20[0], res21[0], res22[0], res23[0], m128iS0[2], m128iS1[2], m128iS2[2], m128iS3[2], m128iS4[2], m128iS5[2], m128iS6[2], m128iS7[2]) TRANSPOSE_8x8_16BIT(res24[0], res25[0], res26[0], res27[0], res28[0], res29[0], res30[0], res31[0], m128iS0[3], m128iS1[3], m128iS2[3], m128iS3[3], m128iS4[3], m128iS5[3], m128iS6[3], m128iS7[3]) #undef TRANSPOSE_8x8_16BIT } } //clip { __m128i max_val = _mm_set1_epi16((1 << (clip_depth2 - 1)) - 1); __m128i min_val = _mm_set1_epi16(-(1 << (clip_depth2 - 1))); for (i = 0; i < 4; i++) { m128iS0[i] = _mm_min_epi16(m128iS0[i], max_val); m128iS0[i] = _mm_max_epi16(m128iS0[i], min_val); m128iS1[i] = _mm_min_epi16(m128iS1[i], max_val); m128iS1[i] = _mm_max_epi16(m128iS1[i], min_val); m128iS2[i] = _mm_min_epi16(m128iS2[i], max_val); m128iS2[i] = _mm_max_epi16(m128iS2[i], min_val); m128iS3[i] = _mm_min_epi16(m128iS3[i], max_val); m128iS3[i] = _mm_max_epi16(m128iS3[i], min_val); m128iS4[i] = _mm_min_epi16(m128iS4[i], max_val); m128iS4[i] = _mm_max_epi16(m128iS4[i], min_val); m128iS5[i] = _mm_min_epi16(m128iS5[i], max_val); m128iS5[i] = _mm_max_epi16(m128iS5[i], min_val); m128iS6[i] = _mm_min_epi16(m128iS6[i], 
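/*
 * The clip stage keeps the second-pass output inside the signed range
 * implied by clip_depth2 = g_bit_depth + 1 + (i_dst & 0x01). A minimal
 * scalar sketch of the clamp performed by _mm_min_epi16/_mm_max_epi16
 * (helper name illustrative, kept out of the build):
 */
#if 0
static short clip_coeff_sketch(int v, int clip_depth)
{
    const int max_val =  (1 << (clip_depth - 1)) - 1;
    const int min_val = -(1 << (clip_depth - 1));
    return (short)(v > max_val ? max_val : (v < min_val ? min_val : v));
}
#endif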
max_val); m128iS6[i] = _mm_max_epi16(m128iS6[i], min_val); m128iS7[i] = _mm_min_epi16(m128iS7[i], max_val); m128iS7[i] = _mm_max_epi16(m128iS7[i], min_val); } } // coeff_t blk2[32 * 8]; // Add: store the eight 32-coefficient output rows (each STORE_LINE writes one full row, so a single pass suffices) { #define STORE_LINE(L0, L1, L2, L3, offsetV) \ _mm_store_si128((__m128i*)(dst + offsetV * i_dst + 0), L0); \ _mm_store_si128((__m128i*)(dst + offsetV * i_dst + 8), L1); \ _mm_store_si128((__m128i*)(dst + offsetV * i_dst + 16), L2); \ _mm_store_si128((__m128i*)(dst + offsetV * i_dst + 24), L3); STORE_LINE(m128iS0[0], m128iS0[1], m128iS0[2], m128iS0[3], 0) STORE_LINE(m128iS1[0], m128iS1[1], m128iS1[2], m128iS1[3], 1) STORE_LINE(m128iS2[0], m128iS2[1], m128iS2[2], m128iS2[3], 2) STORE_LINE(m128iS3[0], m128iS3[1], m128iS3[2], m128iS3[3], 3) STORE_LINE(m128iS4[0], m128iS4[1], m128iS4[2], m128iS4[3], 4) STORE_LINE(m128iS5[0], m128iS5[1], m128iS5[2], m128iS5[3], 5) STORE_LINE(m128iS6[0], m128iS6[1], m128iS6[2], m128iS6[3], 6) STORE_LINE(m128iS7[0], m128iS7[1], m128iS7[2], m128iS7[3], 7) #undef STORE_LINE } } /* --------------------------------------------------------------------------- */ void idct_32x8_half_sse128(const coeff_t *src, coeff_t *dst, int i_dst) { // TODO: implement this // 1/2 case: only the top-left 16x8 coefficients are non-zero, so falling back to the full 32x8 kernel is correct but slower idct_32x8_sse128(src, dst, i_dst); } /* --------------------------------------------------------------------------- */ void idct_32x8_quad_sse128(const coeff_t *src, coeff_t *dst, int i_dst) { // TODO: implement this // 1/4 case: only the top-left 8x8 coefficients are non-zero idct_32x8_half_sse128(src, dst, i_dst); } /* --------------------------------------------------------------------------- */ void idct_8x32_sse128(const coeff_t *src, coeff_t *dst, int i_dst) { const __m128i c16_p45_p45 = _mm_set1_epi32(0x002D002D); const __m128i c16_p43_p44 = _mm_set1_epi32(0x002B002C); const __m128i c16_p39_p41 = _mm_set1_epi32(0x00270029); const __m128i c16_p34_p36 = _mm_set1_epi32(0x00220024); const __m128i c16_p27_p30 = _mm_set1_epi32(0x001B001E); const __m128i c16_p19_p23 = _mm_set1_epi32(0x00130017); const __m128i c16_p11_p15 = _mm_set1_epi32(0x000B000F); const __m128i c16_p02_p07 = _mm_set1_epi32(0x00020007); const __m128i c16_p41_p45 = _mm_set1_epi32(0x0029002D); const __m128i c16_p23_p34 = _mm_set1_epi32(0x00170022); const __m128i c16_n02_p11 = _mm_set1_epi32(0xFFFE000B); const __m128i c16_n27_n15 = _mm_set1_epi32(0xFFE5FFF1); const __m128i c16_n43_n36 = _mm_set1_epi32(0xFFD5FFDC); const __m128i c16_n44_n45 = _mm_set1_epi32(0xFFD4FFD3); const __m128i c16_n30_n39 = _mm_set1_epi32(0xFFE2FFD9); const __m128i c16_n07_n19 = _mm_set1_epi32(0xFFF9FFED); const __m128i c16_p34_p44 = _mm_set1_epi32(0x0022002C); const __m128i c16_n07_p15 = _mm_set1_epi32(0xFFF9000F); const __m128i c16_n41_n27 = _mm_set1_epi32(0xFFD7FFE5); const __m128i c16_n39_n45 = _mm_set1_epi32(0xFFD9FFD3); const __m128i c16_n02_n23 = _mm_set1_epi32(0xFFFEFFE9); const __m128i c16_p36_p19 = _mm_set1_epi32(0x00240013); const __m128i c16_p43_p45 = _mm_set1_epi32(0x002B002D); const __m128i c16_p11_p30 = _mm_set1_epi32(0x000B001E); const __m128i c16_p23_p43 = _mm_set1_epi32(0x0017002B); const __m128i c16_n34_n07 = _mm_set1_epi32(0xFFDEFFF9); const __m128i c16_n36_n45 = _mm_set1_epi32(0xFFDCFFD3); const __m128i c16_p19_n11 = _mm_set1_epi32(0x0013FFF5); const __m128i c16_p44_p41 = _mm_set1_epi32(0x002C0029); const __m128i c16_n02_p27 = _mm_set1_epi32(0xFFFE001B); const __m128i c16_n45_n30 = _mm_set1_epi32(0xFFD3FFE2); const __m128i c16_n15_n39 = _mm_set1_epi32(0xFFF1FFD9); const __m128i c16_p11_p41 = _mm_set1_epi32(0x000B0029); const __m128i c16_n45_n27 = 
_mm_set1_epi32(0xFFD3FFE5); const __m128i c16_p07_n30 = _mm_set1_epi32(0x0007FFE2); const __m128i c16_p43_p39 = _mm_set1_epi32(0x002B0027); const __m128i c16_n23_p15 = _mm_set1_epi32(0xFFE9000F); const __m128i c16_n34_n45 = _mm_set1_epi32(0xFFDEFFD3); const __m128i c16_p36_p02 = _mm_set1_epi32(0x00240002); const __m128i c16_p19_p44 = _mm_set1_epi32(0x0013002C); const __m128i c16_n02_p39 = _mm_set1_epi32(0xFFFE0027); const __m128i c16_n36_n41 = _mm_set1_epi32(0xFFDCFFD7); const __m128i c16_p43_p07 = _mm_set1_epi32(0x002B0007); const __m128i c16_n11_p34 = _mm_set1_epi32(0xFFF50022); const __m128i c16_n30_n44 = _mm_set1_epi32(0xFFE2FFD4); const __m128i c16_p45_p15 = _mm_set1_epi32(0x002D000F); const __m128i c16_n19_p27 = _mm_set1_epi32(0xFFED001B); const __m128i c16_n23_n45 = _mm_set1_epi32(0xFFE9FFD3); const __m128i c16_n15_p36 = _mm_set1_epi32(0xFFF10024); const __m128i c16_n11_n45 = _mm_set1_epi32(0xFFF5FFD3); const __m128i c16_p34_p39 = _mm_set1_epi32(0x00220027); const __m128i c16_n45_n19 = _mm_set1_epi32(0xFFD3FFED); const __m128i c16_p41_n07 = _mm_set1_epi32(0x0029FFF9); const __m128i c16_n23_p30 = _mm_set1_epi32(0xFFE9001E); const __m128i c16_n02_n44 = _mm_set1_epi32(0xFFFEFFD4); const __m128i c16_p27_p43 = _mm_set1_epi32(0x001B002B); const __m128i c16_n27_p34 = _mm_set1_epi32(0xFFE50022); const __m128i c16_p19_n39 = _mm_set1_epi32(0x0013FFD9); const __m128i c16_n11_p43 = _mm_set1_epi32(0xFFF5002B); const __m128i c16_p02_n45 = _mm_set1_epi32(0x0002FFD3); const __m128i c16_p07_p45 = _mm_set1_epi32(0x0007002D); const __m128i c16_n15_n44 = _mm_set1_epi32(0xFFF1FFD4); const __m128i c16_p23_p41 = _mm_set1_epi32(0x00170029); const __m128i c16_n30_n36 = _mm_set1_epi32(0xFFE2FFDC); const __m128i c16_n36_p30 = _mm_set1_epi32(0xFFDC001E); const __m128i c16_p41_n23 = _mm_set1_epi32(0x0029FFE9); const __m128i c16_n44_p15 = _mm_set1_epi32(0xFFD4000F); const __m128i c16_p45_n07 = _mm_set1_epi32(0x002DFFF9); const __m128i c16_n45_n02 = _mm_set1_epi32(0xFFD3FFFE); const __m128i c16_p43_p11 = _mm_set1_epi32(0x002B000B); const __m128i c16_n39_n19 = _mm_set1_epi32(0xFFD9FFED); const __m128i c16_p34_p27 = _mm_set1_epi32(0x0022001B); const __m128i c16_n43_p27 = _mm_set1_epi32(0xFFD5001B); const __m128i c16_p44_n02 = _mm_set1_epi32(0x002CFFFE); const __m128i c16_n30_n23 = _mm_set1_epi32(0xFFE2FFE9); const __m128i c16_p07_p41 = _mm_set1_epi32(0x00070029); const __m128i c16_p19_n45 = _mm_set1_epi32(0x0013FFD3); const __m128i c16_n39_p34 = _mm_set1_epi32(0xFFD90022); const __m128i c16_p45_n11 = _mm_set1_epi32(0x002DFFF5); const __m128i c16_n36_n15 = _mm_set1_epi32(0xFFDCFFF1); const __m128i c16_n45_p23 = _mm_set1_epi32(0xFFD30017); const __m128i c16_p27_p19 = _mm_set1_epi32(0x001B0013); const __m128i c16_p15_n45 = _mm_set1_epi32(0x000FFFD3); const __m128i c16_n44_p30 = _mm_set1_epi32(0xFFD4001E); const __m128i c16_p34_p11 = _mm_set1_epi32(0x0022000B); const __m128i c16_p07_n43 = _mm_set1_epi32(0x0007FFD5); const __m128i c16_n41_p36 = _mm_set1_epi32(0xFFD70024); const __m128i c16_p39_p02 = _mm_set1_epi32(0x00270002); const __m128i c16_n44_p19 = _mm_set1_epi32(0xFFD40013); const __m128i c16_n02_p36 = _mm_set1_epi32(0xFFFE0024); const __m128i c16_p45_n34 = _mm_set1_epi32(0x002DFFDE); const __m128i c16_n15_n23 = _mm_set1_epi32(0xFFF1FFE9); const __m128i c16_n39_p43 = _mm_set1_epi32(0xFFD9002B); const __m128i c16_p30_p07 = _mm_set1_epi32(0x001E0007); const __m128i c16_p27_n45 = _mm_set1_epi32(0x001BFFD3); const __m128i c16_n41_p11 = _mm_set1_epi32(0xFFD7000B); const __m128i c16_n39_p15 = 
_mm_set1_epi32(0xFFD9000F); const __m128i c16_n30_p45 = _mm_set1_epi32(0xFFE2002D); const __m128i c16_p27_p02 = _mm_set1_epi32(0x001B0002); const __m128i c16_p41_n44 = _mm_set1_epi32(0x0029FFD4); const __m128i c16_n11_n19 = _mm_set1_epi32(0xFFF5FFED); const __m128i c16_n45_p36 = _mm_set1_epi32(0xFFD30024); const __m128i c16_n07_p34 = _mm_set1_epi32(0xFFF90022); const __m128i c16_p43_n23 = _mm_set1_epi32(0x002BFFE9); const __m128i c16_n30_p11 = _mm_set1_epi32(0xFFE2000B); const __m128i c16_n45_p43 = _mm_set1_epi32(0xFFD3002B); const __m128i c16_n19_p36 = _mm_set1_epi32(0xFFED0024); const __m128i c16_p23_n02 = _mm_set1_epi32(0x0017FFFE); const __m128i c16_p45_n39 = _mm_set1_epi32(0x002DFFD9); const __m128i c16_p27_n41 = _mm_set1_epi32(0x001BFFD7); const __m128i c16_n15_n07 = _mm_set1_epi32(0xFFF1FFF9); const __m128i c16_n44_p34 = _mm_set1_epi32(0xFFD40022); const __m128i c16_n19_p07 = _mm_set1_epi32(0xFFED0007); const __m128i c16_n39_p30 = _mm_set1_epi32(0xFFD9001E); const __m128i c16_n45_p44 = _mm_set1_epi32(0xFFD3002C); const __m128i c16_n36_p43 = _mm_set1_epi32(0xFFDC002B); const __m128i c16_n15_p27 = _mm_set1_epi32(0xFFF1001B); const __m128i c16_p11_p02 = _mm_set1_epi32(0x000B0002); const __m128i c16_p34_n23 = _mm_set1_epi32(0x0022FFE9); const __m128i c16_p45_n41 = _mm_set1_epi32(0x002DFFD7); const __m128i c16_n07_p02 = _mm_set1_epi32(0xFFF90002); const __m128i c16_n15_p11 = _mm_set1_epi32(0xFFF1000B); const __m128i c16_n23_p19 = _mm_set1_epi32(0xFFE90013); const __m128i c16_n30_p27 = _mm_set1_epi32(0xFFE2001B); const __m128i c16_n36_p34 = _mm_set1_epi32(0xFFDC0022); const __m128i c16_n41_p39 = _mm_set1_epi32(0xFFD70027); const __m128i c16_n44_p43 = _mm_set1_epi32(0xFFD4002B); const __m128i c16_n45_p45 = _mm_set1_epi32(0xFFD3002D); // const __m128i c16_p43_p45 = _mm_set1_epi32(0x002B002D); const __m128i c16_p35_p40 = _mm_set1_epi32(0x00230028); const __m128i c16_p21_p29 = _mm_set1_epi32(0x0015001D); const __m128i c16_p04_p13 = _mm_set1_epi32(0x0004000D); const __m128i c16_p29_p43 = _mm_set1_epi32(0x001D002B); const __m128i c16_n21_p04 = _mm_set1_epi32(0xFFEB0004); const __m128i c16_n45_n40 = _mm_set1_epi32(0xFFD3FFD8); const __m128i c16_n13_n35 = _mm_set1_epi32(0xFFF3FFDD); const __m128i c16_p04_p40 = _mm_set1_epi32(0x00040028); const __m128i c16_n43_n35 = _mm_set1_epi32(0xFFD5FFDD); const __m128i c16_p29_n13 = _mm_set1_epi32(0x001DFFF3); const __m128i c16_p21_p45 = _mm_set1_epi32(0x0015002D); const __m128i c16_n21_p35 = _mm_set1_epi32(0xFFEB0023); const __m128i c16_p04_n43 = _mm_set1_epi32(0x0004FFD5); const __m128i c16_p13_p45 = _mm_set1_epi32(0x000D002D); const __m128i c16_n29_n40 = _mm_set1_epi32(0xFFE3FFD8); const __m128i c16_n40_p29 = _mm_set1_epi32(0xFFD8001D); const __m128i c16_p45_n13 = _mm_set1_epi32(0x002DFFF3); const __m128i c16_n43_n04 = _mm_set1_epi32(0xFFD5FFFC); const __m128i c16_p35_p21 = _mm_set1_epi32(0x00230015); const __m128i c16_n45_p21 = _mm_set1_epi32(0xFFD30015); const __m128i c16_p13_p29 = _mm_set1_epi32(0x000D001D); const __m128i c16_p35_n43 = _mm_set1_epi32(0x0023FFD5); const __m128i c16_n40_p04 = _mm_set1_epi32(0xFFD80004); const __m128i c16_n35_p13 = _mm_set1_epi32(0xFFDD000D); const __m128i c16_n40_p45 = _mm_set1_epi32(0xFFD8002D); const __m128i c16_p04_p21 = _mm_set1_epi32(0x00040015); const __m128i c16_p43_n29 = _mm_set1_epi32(0x002BFFE3); const __m128i c16_n13_p04 = _mm_set1_epi32(0xFFF30004); const __m128i c16_n29_p21 = _mm_set1_epi32(0xFFE30015); const __m128i c16_n40_p35 = _mm_set1_epi32(0xFFD80023); // const __m128i c16_n45_p43 = 
_mm_set1_epi32(0xFFD3002B); const __m128i c16_p38_p44 = _mm_set1_epi32(0x0026002C); const __m128i c16_p09_p25 = _mm_set1_epi32(0x00090019); const __m128i c16_n09_p38 = _mm_set1_epi32(0xFFF70026); const __m128i c16_n25_n44 = _mm_set1_epi32(0xFFE7FFD4); const __m128i c16_n44_p25 = _mm_set1_epi32(0xFFD40019); const __m128i c16_p38_p09 = _mm_set1_epi32(0x00260009); const __m128i c16_n25_p09 = _mm_set1_epi32(0xFFE70009); const __m128i c16_n44_p38 = _mm_set1_epi32(0xFFD40026); const __m128i c16_p17_p42 = _mm_set1_epi32(0x0011002A); const __m128i c16_n42_p17 = _mm_set1_epi32(0xFFD60011); const __m128i c16_p32_p32 = _mm_set1_epi32(0x00200020); const __m128i c16_n32_p32 = _mm_set1_epi32(0xFFE00020); __m128i c32_rnd = _mm_set1_epi32(16); int nShift = 5, pass; //int shift1 = 5; int shift2 = 20 - g_bit_depth - (i_dst & 0x01); //int clip_depth1 = LIMIT_BIT; int clip_depth2 = g_bit_depth + 1 + (i_dst & 0x01); // DCT1 __m128i in00, in01, in02, in03, in04, in05, in06, in07, in08, in09, in10, in11, in12, in13, in14, in15; __m128i in16, in17, in18, in19, in20, in21, in22, in23, in24, in25, in26, in27, in28, in29, in30, in31; __m128i res00[4], res01[4], res02[4], res03[4], res04[4], res05[4], res06[4], res07[4]; i_dst &= 0xFE; in00 = _mm_load_si128((const __m128i*)&src[0 * 8]); in01 = _mm_load_si128((const __m128i*)&src[ 1 * 8]); in02 = _mm_load_si128((const __m128i*)&src[ 2 * 8]); in03 = _mm_load_si128((const __m128i*)&src[ 3 * 8]); in04 = _mm_load_si128((const __m128i*)&src[ 4 * 8]); in05 = _mm_load_si128((const __m128i*)&src[ 5 * 8]); in06 = _mm_load_si128((const __m128i*)&src[ 6 * 8]); in07 = _mm_load_si128((const __m128i*)&src[ 7 * 8]); in08 = _mm_load_si128((const __m128i*)&src[ 8 * 8]); in09 = _mm_load_si128((const __m128i*)&src[ 9 * 8]); in10 = _mm_load_si128((const __m128i*)&src[10 * 8]); in11 = _mm_load_si128((const __m128i*)&src[11 * 8]); in12 = _mm_load_si128((const __m128i*)&src[12 * 8]); in13 = _mm_load_si128((const __m128i*)&src[13 * 8]); in14 = _mm_load_si128((const __m128i*)&src[14 * 8]); in15 = _mm_load_si128((const __m128i*)&src[15 * 8]); in16 = _mm_load_si128((const __m128i*)&src[16 * 8]); in17 = _mm_load_si128((const __m128i*)&src[17 * 8]); in18 = _mm_load_si128((const __m128i*)&src[18 * 8]); in19 = _mm_load_si128((const __m128i*)&src[19 * 8]); in20 = _mm_load_si128((const __m128i*)&src[20 * 8]); in21 = _mm_load_si128((const __m128i*)&src[21 * 8]); in22 = _mm_load_si128((const __m128i*)&src[22 * 8]); in23 = _mm_load_si128((const __m128i*)&src[23 * 8]); in24 = _mm_load_si128((const __m128i*)&src[24 * 8]); in25 = _mm_load_si128((const __m128i*)&src[25 * 8]); in26 = _mm_load_si128((const __m128i*)&src[26 * 8]); in27 = _mm_load_si128((const __m128i*)&src[27 * 8]); in28 = _mm_load_si128((const __m128i*)&src[28 * 8]); in29 = _mm_load_si128((const __m128i*)&src[29 * 8]); in30 = _mm_load_si128((const __m128i*)&src[30 * 8]); in31 = _mm_load_si128((const __m128i*)&src[31 * 8]); { const __m128i T_00_00A = _mm_unpacklo_epi16(in01, in03); // [33 13 32 12 31 11 30 10] const __m128i T_00_00B = _mm_unpackhi_epi16(in01, in03); // [37 17 36 16 35 15 34 14] const __m128i T_00_01A = _mm_unpacklo_epi16(in05, in07); // [ ] const __m128i T_00_01B = _mm_unpackhi_epi16(in05, in07); // [ ] const __m128i T_00_02A = _mm_unpacklo_epi16(in09, in11); // [ ] const __m128i T_00_02B = _mm_unpackhi_epi16(in09, in11); // [ ] const __m128i T_00_03A = _mm_unpacklo_epi16(in13, in15); // [ ] const __m128i T_00_03B = _mm_unpackhi_epi16(in13, in15); // [ ] const __m128i T_00_04A = _mm_unpacklo_epi16(in17, in19); // [ ] const 
__m128i T_00_04B = _mm_unpackhi_epi16(in17, in19); // [ ] const __m128i T_00_05A = _mm_unpacklo_epi16(in21, in23); // [ ] const __m128i T_00_05B = _mm_unpackhi_epi16(in21, in23); // [ ] const __m128i T_00_06A = _mm_unpacklo_epi16(in25, in27); // [ ] const __m128i T_00_06B = _mm_unpackhi_epi16(in25, in27); // [ ] const __m128i T_00_07A = _mm_unpacklo_epi16(in29, in31); // const __m128i T_00_07B = _mm_unpackhi_epi16(in29, in31); // [ ] const __m128i T_00_08A = _mm_unpacklo_epi16(in02, in06); // [ ] const __m128i T_00_08B = _mm_unpackhi_epi16(in02, in06); // [ ] const __m128i T_00_09A = _mm_unpacklo_epi16(in10, in14); // [ ] const __m128i T_00_09B = _mm_unpackhi_epi16(in10, in14); // [ ] const __m128i T_00_10A = _mm_unpacklo_epi16(in18, in22); // [ ] const __m128i T_00_10B = _mm_unpackhi_epi16(in18, in22); // [ ] const __m128i T_00_11A = _mm_unpacklo_epi16(in26, in30); // [ ] const __m128i T_00_11B = _mm_unpackhi_epi16(in26, in30); // [ ] const __m128i T_00_12A = _mm_unpacklo_epi16(in04, in12); // [ ] const __m128i T_00_12B = _mm_unpackhi_epi16(in04, in12); // [ ] const __m128i T_00_13A = _mm_unpacklo_epi16(in20, in28); // [ ] const __m128i T_00_13B = _mm_unpackhi_epi16(in20, in28); // [ ] const __m128i T_00_14A = _mm_unpacklo_epi16(in08, in24); // const __m128i T_00_14B = _mm_unpackhi_epi16(in08, in24); // [ ] const __m128i T_00_15A = _mm_unpacklo_epi16(in00, in16); // const __m128i T_00_15B = _mm_unpackhi_epi16(in00, in16); // [ ] __m128i O00A, O01A, O02A, O03A, O04A, O05A, O06A, O07A, O08A, O09A, O10A, O11A, O12A, O13A, O14A, O15A; __m128i O00B, O01B, O02B, O03B, O04B, O05B, O06B, O07B, O08B, O09B, O10B, O11B, O12B, O13B, O14B, O15B; __m128i EO0A, EO1A, EO2A, EO3A, EO4A, EO5A, EO6A, EO7A; __m128i EO0B, EO1B, EO2B, EO3B, EO4B, EO5B, EO6B, EO7B; { __m128i T00, T01, T02, T03; #define COMPUTE_ROW(r0103, r0507, r0911, r1315, r1719, r2123, r2527, r2931, c0103, c0507, c0911, c1315, c1719, c2123, c2527, c2931, row) \ T00 = _mm_add_epi32(_mm_madd_epi16(r0103, c0103), _mm_madd_epi16(r0507, c0507)); \ T01 = _mm_add_epi32(_mm_madd_epi16(r0911, c0911), _mm_madd_epi16(r1315, c1315)); \ T02 = _mm_add_epi32(_mm_madd_epi16(r1719, c1719), _mm_madd_epi16(r2123, c2123)); \ T03 = _mm_add_epi32(_mm_madd_epi16(r2527, c2527), _mm_madd_epi16(r2931, c2931)); \ row = _mm_add_epi32(_mm_add_epi32(T00, T01), _mm_add_epi32(T02, T03)); COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_p45_p45, c16_p43_p44, c16_p39_p41, c16_p34_p36, c16_p27_p30, c16_p19_p23, c16_p11_p15, c16_p02_p07, O00A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_p41_p45, c16_p23_p34, c16_n02_p11, c16_n27_n15, c16_n43_n36, c16_n44_n45, c16_n30_n39, c16_n07_n19, O01A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_p34_p44, c16_n07_p15, c16_n41_n27, c16_n39_n45, c16_n02_n23, c16_p36_p19, c16_p43_p45, c16_p11_p30, O02A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_p23_p43, c16_n34_n07, c16_n36_n45, c16_p19_n11, c16_p44_p41, c16_n02_p27, c16_n45_n30, c16_n15_n39, O03A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_p11_p41, c16_n45_n27, c16_p07_n30, c16_p43_p39, c16_n23_p15, c16_n34_n45, c16_p36_p02, c16_p19_p44, O04A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n02_p39, c16_n36_n41, c16_p43_p07, c16_n11_p34, c16_n30_n44, c16_p45_p15, c16_n19_p27, 
c16_n23_n45, O05A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n15_p36, c16_n11_n45, c16_p34_p39, c16_n45_n19, c16_p41_n07, c16_n23_p30, c16_n02_n44, c16_p27_p43, O06A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n27_p34, c16_p19_n39, c16_n11_p43, c16_p02_n45, c16_p07_p45, c16_n15_n44, c16_p23_p41, c16_n30_n36, O07A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n36_p30, c16_p41_n23, c16_n44_p15, c16_p45_n07, c16_n45_n02, c16_p43_p11, c16_n39_n19, c16_p34_p27, O08A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n43_p27, c16_p44_n02, c16_n30_n23, c16_p07_p41, c16_p19_n45, c16_n39_p34, c16_p45_n11, c16_n36_n15, O09A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n45_p23, c16_p27_p19, c16_p15_n45, c16_n44_p30, c16_p34_p11, c16_p07_n43, c16_n41_p36, c16_p39_p02, O10A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n44_p19, c16_n02_p36, c16_p45_n34, c16_n15_n23, c16_n39_p43, c16_p30_p07, c16_p27_n45, c16_n41_p11, O11A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n39_p15, c16_n30_p45, c16_p27_p02, c16_p41_n44, c16_n11_n19, c16_n45_p36, c16_n07_p34, c16_p43_n23, O12A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n30_p11, c16_n45_p43, c16_n19_p36, c16_p23_n02, c16_p45_n39, c16_p27_n41, c16_n15_n07, c16_n44_p34, O13A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n19_p07, c16_n39_p30, c16_n45_p44, c16_n36_p43, c16_n15_p27, c16_p11_p02, c16_p34_n23, c16_p45_n41, O14A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n07_p02, c16_n15_p11, c16_n23_p19, c16_n30_p27, c16_n36_p34, c16_n41_p39, c16_n44_p43, c16_n45_p45, O15A) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_p45_p45, c16_p43_p44, c16_p39_p41, c16_p34_p36, c16_p27_p30, c16_p19_p23, c16_p11_p15, c16_p02_p07, O00B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_p41_p45, c16_p23_p34, c16_n02_p11, c16_n27_n15, c16_n43_n36, c16_n44_n45, c16_n30_n39, c16_n07_n19, O01B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_p34_p44, c16_n07_p15, c16_n41_n27, c16_n39_n45, c16_n02_n23, c16_p36_p19, c16_p43_p45, c16_p11_p30, O02B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_p23_p43, c16_n34_n07, c16_n36_n45, c16_p19_n11, c16_p44_p41, c16_n02_p27, c16_n45_n30, c16_n15_n39, O03B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_p11_p41, c16_n45_n27, c16_p07_n30, c16_p43_p39, c16_n23_p15, c16_n34_n45, c16_p36_p02, c16_p19_p44, O04B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n02_p39, c16_n36_n41, c16_p43_p07, c16_n11_p34, c16_n30_n44, c16_p45_p15, c16_n19_p27, c16_n23_n45, O05B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n15_p36, c16_n11_n45, c16_p34_p39, c16_n45_n19, c16_p41_n07, c16_n23_p30, c16_n02_n44, c16_p27_p43, O06B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, 
T_00_05B, T_00_06B, T_00_07B, \ c16_n27_p34, c16_p19_n39, c16_n11_p43, c16_p02_n45, c16_p07_p45, c16_n15_n44, c16_p23_p41, c16_n30_n36, O07B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n36_p30, c16_p41_n23, c16_n44_p15, c16_p45_n07, c16_n45_n02, c16_p43_p11, c16_n39_n19, c16_p34_p27, O08B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n43_p27, c16_p44_n02, c16_n30_n23, c16_p07_p41, c16_p19_n45, c16_n39_p34, c16_p45_n11, c16_n36_n15, O09B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n45_p23, c16_p27_p19, c16_p15_n45, c16_n44_p30, c16_p34_p11, c16_p07_n43, c16_n41_p36, c16_p39_p02, O10B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n44_p19, c16_n02_p36, c16_p45_n34, c16_n15_n23, c16_n39_p43, c16_p30_p07, c16_p27_n45, c16_n41_p11, O11B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n39_p15, c16_n30_p45, c16_p27_p02, c16_p41_n44, c16_n11_n19, c16_n45_p36, c16_n07_p34, c16_p43_n23, O12B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n30_p11, c16_n45_p43, c16_n19_p36, c16_p23_n02, c16_p45_n39, c16_p27_n41, c16_n15_n07, c16_n44_p34, O13B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n19_p07, c16_n39_p30, c16_n45_p44, c16_n36_p43, c16_n15_p27, c16_p11_p02, c16_p34_n23, c16_p45_n41, O14B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n07_p02, c16_n15_p11, c16_n23_p19, c16_n30_p27, c16_n36_p34, c16_n41_p39, c16_n44_p43, c16_n45_p45, O15B) #undef COMPUTE_ROW } { __m128i T00, T01; #define COMPUTE_ROW(row0206, row1014, row1822, row2630, c0206, c1014, c1822, c2630, row) \ T00 = _mm_add_epi32(_mm_madd_epi16(row0206, c0206), _mm_madd_epi16(row1014, c1014)); \ T01 = _mm_add_epi32(_mm_madd_epi16(row1822, c1822), _mm_madd_epi16(row2630, c2630)); \ row = _mm_add_epi32(T00, T01); COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_p43_p45, c16_p35_p40, c16_p21_p29, c16_p04_p13, EO0A) COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_p29_p43, c16_n21_p04, c16_n45_n40, c16_n13_n35, EO1A) COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_p04_p40, c16_n43_n35, c16_p29_n13, c16_p21_p45, EO2A) COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_n21_p35, c16_p04_n43, c16_p13_p45, c16_n29_n40, EO3A) COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_n40_p29, c16_p45_n13, c16_n43_n04, c16_p35_p21, EO4A) COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_n45_p21, c16_p13_p29, c16_p35_n43, c16_n40_p04, EO5A) COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_n35_p13, c16_n40_p45, c16_p04_p21, c16_p43_n29, EO6A) COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_n13_p04, c16_n29_p21, c16_n40_p35, c16_n45_p43, EO7A) COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_p43_p45, c16_p35_p40, c16_p21_p29, c16_p04_p13, EO0B) COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_p29_p43, c16_n21_p04, c16_n45_n40, c16_n13_n35, EO1B) COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_p04_p40, c16_n43_n35, c16_p29_n13, c16_p21_p45, EO2B) COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_n21_p35, c16_p04_n43, c16_p13_p45, c16_n29_n40, EO3B) COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_n40_p29, c16_p45_n13, c16_n43_n04, c16_p35_p21, EO4B) 
COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_n45_p21, c16_p13_p29, c16_p35_n43, c16_n40_p04, EO5B) COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_n35_p13, c16_n40_p45, c16_p04_p21, c16_p43_n29, EO6B) COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_n13_p04, c16_n29_p21, c16_n40_p35, c16_n45_p43, EO7B) #undef COMPUTE_ROW } { const __m128i EEO0A = _mm_add_epi32(_mm_madd_epi16(T_00_12A, c16_p38_p44), _mm_madd_epi16(T_00_13A, c16_p09_p25)); const __m128i EEO1A = _mm_add_epi32(_mm_madd_epi16(T_00_12A, c16_n09_p38), _mm_madd_epi16(T_00_13A, c16_n25_n44)); const __m128i EEO2A = _mm_add_epi32(_mm_madd_epi16(T_00_12A, c16_n44_p25), _mm_madd_epi16(T_00_13A, c16_p38_p09)); const __m128i EEO3A = _mm_add_epi32(_mm_madd_epi16(T_00_12A, c16_n25_p09), _mm_madd_epi16(T_00_13A, c16_n44_p38)); const __m128i EEO0B = _mm_add_epi32(_mm_madd_epi16(T_00_12B, c16_p38_p44), _mm_madd_epi16(T_00_13B, c16_p09_p25)); const __m128i EEO1B = _mm_add_epi32(_mm_madd_epi16(T_00_12B, c16_n09_p38), _mm_madd_epi16(T_00_13B, c16_n25_n44)); const __m128i EEO2B = _mm_add_epi32(_mm_madd_epi16(T_00_12B, c16_n44_p25), _mm_madd_epi16(T_00_13B, c16_p38_p09)); const __m128i EEO3B = _mm_add_epi32(_mm_madd_epi16(T_00_12B, c16_n25_p09), _mm_madd_epi16(T_00_13B, c16_n44_p38)); const __m128i EEEO0A = _mm_madd_epi16(T_00_14A, c16_p17_p42); const __m128i EEEO0B = _mm_madd_epi16(T_00_14B, c16_p17_p42); const __m128i EEEO1A = _mm_madd_epi16(T_00_14A, c16_n42_p17); const __m128i EEEO1B = _mm_madd_epi16(T_00_14B, c16_n42_p17); const __m128i EEEE0A = _mm_madd_epi16(T_00_15A, c16_p32_p32); const __m128i EEEE0B = _mm_madd_epi16(T_00_15B, c16_p32_p32); const __m128i EEEE1A = _mm_madd_epi16(T_00_15A, c16_n32_p32); const __m128i EEEE1B = _mm_madd_epi16(T_00_15B, c16_n32_p32); const __m128i EEE0A = _mm_add_epi32(EEEE0A, EEEO0A); // EEE0 = EEEE0 + EEEO0 const __m128i EEE0B = _mm_add_epi32(EEEE0B, EEEO0B); const __m128i EEE1A = _mm_add_epi32(EEEE1A, EEEO1A); // EEE1 = EEEE1 + EEEO1 const __m128i EEE1B = _mm_add_epi32(EEEE1B, EEEO1B); const __m128i EEE3A = _mm_sub_epi32(EEEE0A, EEEO0A); // EEE3 = EEEE0 - EEEO0 const __m128i EEE3B = _mm_sub_epi32(EEEE0B, EEEO0B); const __m128i EEE2A = _mm_sub_epi32(EEEE1A, EEEO1A); // EEE2 = EEEE1 - EEEO1 const __m128i EEE2B = _mm_sub_epi32(EEEE1B, EEEO1B); const __m128i EE0A = _mm_add_epi32(EEE0A, EEO0A); // EE0 = EEE0 + EEO0 const __m128i EE0B = _mm_add_epi32(EEE0B, EEO0B); const __m128i EE1A = _mm_add_epi32(EEE1A, EEO1A); // EE1 = EEE1 + EEO1 const __m128i EE1B = _mm_add_epi32(EEE1B, EEO1B); const __m128i EE2A = _mm_add_epi32(EEE2A, EEO2A); // EE2 = EEE2 + EEO2 const __m128i EE2B = _mm_add_epi32(EEE2B, EEO2B); const __m128i EE3A = _mm_add_epi32(EEE3A, EEO3A); // EE3 = EEE3 + EEO3 const __m128i EE3B = _mm_add_epi32(EEE3B, EEO3B); const __m128i EE7A = _mm_sub_epi32(EEE0A, EEO0A); // EE7 = EEE0 - EEO0 const __m128i EE7B = _mm_sub_epi32(EEE0B, EEO0B); const __m128i EE6A = _mm_sub_epi32(EEE1A, EEO1A); // EE6 = EEE1 - EEO1 const __m128i EE6B = _mm_sub_epi32(EEE1B, EEO1B); const __m128i EE5A = _mm_sub_epi32(EEE2A, EEO2A); // EE5 = EEE2 - EEO2 const __m128i EE5B = _mm_sub_epi32(EEE2B, EEO2B); const __m128i EE4A = _mm_sub_epi32(EEE3A, EEO3A); // EE4 = EEE3 - EEO3 const __m128i EE4B = _mm_sub_epi32(EEE3B, EEO3B); const __m128i E0A = _mm_add_epi32(EE0A, EO0A); // E0 = EE0 + EO0 const __m128i E0B = _mm_add_epi32(EE0B, EO0B); const __m128i E1A = _mm_add_epi32(EE1A, EO1A); // E1 = EE1 + EO1 const __m128i E1B = _mm_add_epi32(EE1B, EO1B); const __m128i E2A = _mm_add_epi32(EE2A, EO2A); // E2 = EE2 + EO2 const 
__m128i E2B = _mm_add_epi32(EE2B, EO2B); const __m128i E3A = _mm_add_epi32(EE3A, EO3A); // E3 = EE3 + EO3 const __m128i E3B = _mm_add_epi32(EE3B, EO3B); const __m128i E4A = _mm_add_epi32(EE4A, EO4A); // E4 = const __m128i E4B = _mm_add_epi32(EE4B, EO4B); const __m128i E5A = _mm_add_epi32(EE5A, EO5A); // E5 = const __m128i E5B = _mm_add_epi32(EE5B, EO5B); const __m128i E6A = _mm_add_epi32(EE6A, EO6A); // E6 = const __m128i E6B = _mm_add_epi32(EE6B, EO6B); const __m128i E7A = _mm_add_epi32(EE7A, EO7A); // E7 = const __m128i E7B = _mm_add_epi32(EE7B, EO7B); const __m128i EFA = _mm_sub_epi32(EE0A, EO0A); // EF = EE0 - EO0 const __m128i EFB = _mm_sub_epi32(EE0B, EO0B); const __m128i EEA = _mm_sub_epi32(EE1A, EO1A); // EE = EE1 - EO1 const __m128i EEB = _mm_sub_epi32(EE1B, EO1B); const __m128i EDA = _mm_sub_epi32(EE2A, EO2A); // ED = EE2 - EO2 const __m128i EDB = _mm_sub_epi32(EE2B, EO2B); const __m128i ECA = _mm_sub_epi32(EE3A, EO3A); // EC = EE3 - EO3 const __m128i ECB = _mm_sub_epi32(EE3B, EO3B); const __m128i EBA = _mm_sub_epi32(EE4A, EO4A); // EB = const __m128i EBB = _mm_sub_epi32(EE4B, EO4B); const __m128i EAA = _mm_sub_epi32(EE5A, EO5A); // EA = const __m128i EAB = _mm_sub_epi32(EE5B, EO5B); const __m128i E9A = _mm_sub_epi32(EE6A, EO6A); // E9 = const __m128i E9B = _mm_sub_epi32(EE6B, EO6B); const __m128i E8A = _mm_sub_epi32(EE7A, EO7A); // E8 = const __m128i E8B = _mm_sub_epi32(EE7B, EO7B); const __m128i T10A = _mm_add_epi32(E0A, c32_rnd); // E0 + rnd const __m128i T10B = _mm_add_epi32(E0B, c32_rnd); const __m128i T11A = _mm_add_epi32(E1A, c32_rnd); // E1 + rnd const __m128i T11B = _mm_add_epi32(E1B, c32_rnd); const __m128i T12A = _mm_add_epi32(E2A, c32_rnd); // E2 + rnd const __m128i T12B = _mm_add_epi32(E2B, c32_rnd); const __m128i T13A = _mm_add_epi32(E3A, c32_rnd); // E3 + rnd const __m128i T13B = _mm_add_epi32(E3B, c32_rnd); const __m128i T14A = _mm_add_epi32(E4A, c32_rnd); // E4 + rnd const __m128i T14B = _mm_add_epi32(E4B, c32_rnd); const __m128i T15A = _mm_add_epi32(E5A, c32_rnd); // E5 + rnd const __m128i T15B = _mm_add_epi32(E5B, c32_rnd); const __m128i T16A = _mm_add_epi32(E6A, c32_rnd); // E6 + rnd const __m128i T16B = _mm_add_epi32(E6B, c32_rnd); const __m128i T17A = _mm_add_epi32(E7A, c32_rnd); // E7 + rnd const __m128i T17B = _mm_add_epi32(E7B, c32_rnd); const __m128i T18A = _mm_add_epi32(E8A, c32_rnd); // E8 + rnd const __m128i T18B = _mm_add_epi32(E8B, c32_rnd); const __m128i T19A = _mm_add_epi32(E9A, c32_rnd); // E9 + rnd const __m128i T19B = _mm_add_epi32(E9B, c32_rnd); const __m128i T1AA = _mm_add_epi32(EAA, c32_rnd); // E10 + rnd const __m128i T1AB = _mm_add_epi32(EAB, c32_rnd); const __m128i T1BA = _mm_add_epi32(EBA, c32_rnd); // E11 + rnd const __m128i T1BB = _mm_add_epi32(EBB, c32_rnd); const __m128i T1CA = _mm_add_epi32(ECA, c32_rnd); // E12 + rnd const __m128i T1CB = _mm_add_epi32(ECB, c32_rnd); const __m128i T1DA = _mm_add_epi32(EDA, c32_rnd); // E13 + rnd const __m128i T1DB = _mm_add_epi32(EDB, c32_rnd); const __m128i T1EA = _mm_add_epi32(EEA, c32_rnd); // E14 + rnd const __m128i T1EB = _mm_add_epi32(EEB, c32_rnd); const __m128i T1FA = _mm_add_epi32(EFA, c32_rnd); // E15 + rnd const __m128i T1FB = _mm_add_epi32(EFB, c32_rnd); const __m128i T2_00A = _mm_add_epi32(T10A, O00A); // E0 + O0 + rnd const __m128i T2_00B = _mm_add_epi32(T10B, O00B); const __m128i T2_01A = _mm_add_epi32(T11A, O01A); // E1 + O1 + rnd const __m128i T2_01B = _mm_add_epi32(T11B, O01B); const __m128i T2_02A = _mm_add_epi32(T12A, O02A); // E2 + O2 + rnd const __m128i T2_02B = 
_mm_add_epi32(T12B, O02B); const __m128i T2_03A = _mm_add_epi32(T13A, O03A); // E3 + O3 + rnd const __m128i T2_03B = _mm_add_epi32(T13B, O03B); const __m128i T2_04A = _mm_add_epi32(T14A, O04A); // E4 const __m128i T2_04B = _mm_add_epi32(T14B, O04B); const __m128i T2_05A = _mm_add_epi32(T15A, O05A); // E5 const __m128i T2_05B = _mm_add_epi32(T15B, O05B); const __m128i T2_06A = _mm_add_epi32(T16A, O06A); // E6 const __m128i T2_06B = _mm_add_epi32(T16B, O06B); const __m128i T2_07A = _mm_add_epi32(T17A, O07A); // E7 const __m128i T2_07B = _mm_add_epi32(T17B, O07B); const __m128i T2_08A = _mm_add_epi32(T18A, O08A); // E8 const __m128i T2_08B = _mm_add_epi32(T18B, O08B); const __m128i T2_09A = _mm_add_epi32(T19A, O09A); // E9 const __m128i T2_09B = _mm_add_epi32(T19B, O09B); const __m128i T2_10A = _mm_add_epi32(T1AA, O10A); // E10 const __m128i T2_10B = _mm_add_epi32(T1AB, O10B); const __m128i T2_11A = _mm_add_epi32(T1BA, O11A); // E11 const __m128i T2_11B = _mm_add_epi32(T1BB, O11B); const __m128i T2_12A = _mm_add_epi32(T1CA, O12A); // E12 const __m128i T2_12B = _mm_add_epi32(T1CB, O12B); const __m128i T2_13A = _mm_add_epi32(T1DA, O13A); // E13 const __m128i T2_13B = _mm_add_epi32(T1DB, O13B); const __m128i T2_14A = _mm_add_epi32(T1EA, O14A); // E14 const __m128i T2_14B = _mm_add_epi32(T1EB, O14B); const __m128i T2_15A = _mm_add_epi32(T1FA, O15A); // E15 const __m128i T2_15B = _mm_add_epi32(T1FB, O15B); const __m128i T2_31A = _mm_sub_epi32(T10A, O00A); // E0 - O0 + rnd const __m128i T2_31B = _mm_sub_epi32(T10B, O00B); const __m128i T2_30A = _mm_sub_epi32(T11A, O01A); // E1 - O1 + rnd const __m128i T2_30B = _mm_sub_epi32(T11B, O01B); const __m128i T2_29A = _mm_sub_epi32(T12A, O02A); // E2 - O2 + rnd const __m128i T2_29B = _mm_sub_epi32(T12B, O02B); const __m128i T2_28A = _mm_sub_epi32(T13A, O03A); // E3 - O3 + rnd const __m128i T2_28B = _mm_sub_epi32(T13B, O03B); const __m128i T2_27A = _mm_sub_epi32(T14A, O04A); // E4 const __m128i T2_27B = _mm_sub_epi32(T14B, O04B); const __m128i T2_26A = _mm_sub_epi32(T15A, O05A); // E5 const __m128i T2_26B = _mm_sub_epi32(T15B, O05B); const __m128i T2_25A = _mm_sub_epi32(T16A, O06A); // E6 const __m128i T2_25B = _mm_sub_epi32(T16B, O06B); const __m128i T2_24A = _mm_sub_epi32(T17A, O07A); // E7 const __m128i T2_24B = _mm_sub_epi32(T17B, O07B); const __m128i T2_23A = _mm_sub_epi32(T18A, O08A); // const __m128i T2_23B = _mm_sub_epi32(T18B, O08B); const __m128i T2_22A = _mm_sub_epi32(T19A, O09A); // const __m128i T2_22B = _mm_sub_epi32(T19B, O09B); const __m128i T2_21A = _mm_sub_epi32(T1AA, O10A); // const __m128i T2_21B = _mm_sub_epi32(T1AB, O10B); const __m128i T2_20A = _mm_sub_epi32(T1BA, O11A); // const __m128i T2_20B = _mm_sub_epi32(T1BB, O11B); const __m128i T2_19A = _mm_sub_epi32(T1CA, O12A); // const __m128i T2_19B = _mm_sub_epi32(T1CB, O12B); const __m128i T2_18A = _mm_sub_epi32(T1DA, O13A); // const __m128i T2_18B = _mm_sub_epi32(T1DB, O13B); const __m128i T2_17A = _mm_sub_epi32(T1EA, O14A); // const __m128i T2_17B = _mm_sub_epi32(T1EB, O14B); const __m128i T2_16A = _mm_sub_epi32(T1FA, O15A); // const __m128i T2_16B = _mm_sub_epi32(T1FB, O15B); const __m128i T3_00A = _mm_srai_epi32(T2_00A, nShift); // [30 20 10 00] const __m128i T3_00B = _mm_srai_epi32(T2_00B, nShift); // [70 60 50 40] const __m128i T3_01A = _mm_srai_epi32(T2_01A, nShift); // [31 21 11 01] const __m128i T3_01B = _mm_srai_epi32(T2_01B, nShift); // [71 61 51 41] const __m128i T3_02A = _mm_srai_epi32(T2_02A, nShift); // [32 22 12 02] const __m128i T3_02B = _mm_srai_epi32(T2_02B, nShift); 
// [72 62 52 42] const __m128i T3_03A = _mm_srai_epi32(T2_03A, nShift); // [33 23 13 03] const __m128i T3_03B = _mm_srai_epi32(T2_03B, nShift); // [73 63 53 43] const __m128i T3_04A = _mm_srai_epi32(T2_04A, nShift); // [33 24 14 04] const __m128i T3_04B = _mm_srai_epi32(T2_04B, nShift); // [74 64 54 44] const __m128i T3_05A = _mm_srai_epi32(T2_05A, nShift); // [35 25 15 05] const __m128i T3_05B = _mm_srai_epi32(T2_05B, nShift); // [75 65 55 45] const __m128i T3_06A = _mm_srai_epi32(T2_06A, nShift); // [36 26 16 06] const __m128i T3_06B = _mm_srai_epi32(T2_06B, nShift); // [76 66 56 46] const __m128i T3_07A = _mm_srai_epi32(T2_07A, nShift); // [37 27 17 07] const __m128i T3_07B = _mm_srai_epi32(T2_07B, nShift); // [77 67 57 47] const __m128i T3_08A = _mm_srai_epi32(T2_08A, nShift); // [30 20 10 00] x8 const __m128i T3_08B = _mm_srai_epi32(T2_08B, nShift); // [70 60 50 40] const __m128i T3_09A = _mm_srai_epi32(T2_09A, nShift); // [31 21 11 01] x9 const __m128i T3_09B = _mm_srai_epi32(T2_09B, nShift); // [71 61 51 41] const __m128i T3_10A = _mm_srai_epi32(T2_10A, nShift); // [32 22 12 02] xA const __m128i T3_10B = _mm_srai_epi32(T2_10B, nShift); // [72 62 52 42] const __m128i T3_11A = _mm_srai_epi32(T2_11A, nShift); // [33 23 13 03] xB const __m128i T3_11B = _mm_srai_epi32(T2_11B, nShift); // [73 63 53 43] const __m128i T3_12A = _mm_srai_epi32(T2_12A, nShift); // [33 24 14 04] xC const __m128i T3_12B = _mm_srai_epi32(T2_12B, nShift); // [74 64 54 44] const __m128i T3_13A = _mm_srai_epi32(T2_13A, nShift); // [35 25 15 05] xD const __m128i T3_13B = _mm_srai_epi32(T2_13B, nShift); // [75 65 55 45] const __m128i T3_14A = _mm_srai_epi32(T2_14A, nShift); // [36 26 16 06] xE const __m128i T3_14B = _mm_srai_epi32(T2_14B, nShift); // [76 66 56 46] const __m128i T3_15A = _mm_srai_epi32(T2_15A, nShift); // [37 27 17 07] xF const __m128i T3_15B = _mm_srai_epi32(T2_15B, nShift); // [77 67 57 47] const __m128i T3_16A = _mm_srai_epi32(T2_16A, nShift); // [30 20 10 00] const __m128i T3_16B = _mm_srai_epi32(T2_16B, nShift); // [70 60 50 40] const __m128i T3_17A = _mm_srai_epi32(T2_17A, nShift); // [31 21 11 01] const __m128i T3_17B = _mm_srai_epi32(T2_17B, nShift); // [71 61 51 41] const __m128i T3_18A = _mm_srai_epi32(T2_18A, nShift); // [32 22 12 02] const __m128i T3_18B = _mm_srai_epi32(T2_18B, nShift); // [72 62 52 42] const __m128i T3_19A = _mm_srai_epi32(T2_19A, nShift); // [33 23 13 03] const __m128i T3_19B = _mm_srai_epi32(T2_19B, nShift); // [73 63 53 43] const __m128i T3_20A = _mm_srai_epi32(T2_20A, nShift); // [33 24 14 04] const __m128i T3_20B = _mm_srai_epi32(T2_20B, nShift); // [74 64 54 44] const __m128i T3_21A = _mm_srai_epi32(T2_21A, nShift); // [35 25 15 05] const __m128i T3_21B = _mm_srai_epi32(T2_21B, nShift); // [75 65 55 45] const __m128i T3_22A = _mm_srai_epi32(T2_22A, nShift); // [36 26 16 06] const __m128i T3_22B = _mm_srai_epi32(T2_22B, nShift); // [76 66 56 46] const __m128i T3_23A = _mm_srai_epi32(T2_23A, nShift); // [37 27 17 07] const __m128i T3_23B = _mm_srai_epi32(T2_23B, nShift); // [77 67 57 47] const __m128i T3_24A = _mm_srai_epi32(T2_24A, nShift); // [30 20 10 00] x8 const __m128i T3_24B = _mm_srai_epi32(T2_24B, nShift); // [70 60 50 40] const __m128i T3_25A = _mm_srai_epi32(T2_25A, nShift); // [31 21 11 01] x9 const __m128i T3_25B = _mm_srai_epi32(T2_25B, nShift); // [71 61 51 41] const __m128i T3_26A = _mm_srai_epi32(T2_26A, nShift); // [32 22 12 02] xA const __m128i T3_26B = _mm_srai_epi32(T2_26B, nShift); // [72 62 52 42] const __m128i T3_27A = _mm_srai_epi32(T2_27A, 
nShift); // [33 23 13 03] xB const __m128i T3_27B = _mm_srai_epi32(T2_27B, nShift); // [73 63 53 43] const __m128i T3_28A = _mm_srai_epi32(T2_28A, nShift); // [33 24 14 04] xC const __m128i T3_28B = _mm_srai_epi32(T2_28B, nShift); // [74 64 54 44] const __m128i T3_29A = _mm_srai_epi32(T2_29A, nShift); // [35 25 15 05] xD const __m128i T3_29B = _mm_srai_epi32(T2_29B, nShift); // [75 65 55 45] const __m128i T3_30A = _mm_srai_epi32(T2_30A, nShift); // [36 26 16 06] xE const __m128i T3_30B = _mm_srai_epi32(T2_30B, nShift); // [76 66 56 46] const __m128i T3_31A = _mm_srai_epi32(T2_31A, nShift); // [37 27 17 07] xF const __m128i T3_31B = _mm_srai_epi32(T2_31B, nShift); // [77 67 57 47] res00[0] = _mm_packs_epi32(T3_00A, T3_00B); // [70 60 50 40 30 20 10 00] res01[0] = _mm_packs_epi32(T3_01A, T3_01B); // [71 61 51 41 31 21 11 01] res02[0] = _mm_packs_epi32(T3_02A, T3_02B); // [72 62 52 42 32 22 12 02] res03[0] = _mm_packs_epi32(T3_03A, T3_03B); // [73 63 53 43 33 23 13 03] res04[0] = _mm_packs_epi32(T3_04A, T3_04B); // [74 64 54 44 34 24 14 04] res05[0] = _mm_packs_epi32(T3_05A, T3_05B); // [75 65 55 45 35 25 15 05] res06[0] = _mm_packs_epi32(T3_06A, T3_06B); // [76 66 56 46 36 26 16 06] res07[0] = _mm_packs_epi32(T3_07A, T3_07B); // [77 67 57 47 37 27 17 07] res00[1] = _mm_packs_epi32(T3_08A, T3_08B); // [A0 ... 80] res01[1] = _mm_packs_epi32(T3_09A, T3_09B); // [A1 ... 81] res02[1] = _mm_packs_epi32(T3_10A, T3_10B); // [A2 ... 82] res03[1] = _mm_packs_epi32(T3_11A, T3_11B); // [A3 ... 83] res04[1] = _mm_packs_epi32(T3_12A, T3_12B); // [A4 ... 84] res05[1] = _mm_packs_epi32(T3_13A, T3_13B); // [A5 ... 85] res06[1] = _mm_packs_epi32(T3_14A, T3_14B); // [A6 ... 86] res07[1] = _mm_packs_epi32(T3_15A, T3_15B); // [A7 ... 87] res00[2] = _mm_packs_epi32(T3_16A, T3_16B); res01[2] = _mm_packs_epi32(T3_17A, T3_17B); res02[2] = _mm_packs_epi32(T3_18A, T3_18B); res03[2] = _mm_packs_epi32(T3_19A, T3_19B); res04[2] = _mm_packs_epi32(T3_20A, T3_20B); res05[2] = _mm_packs_epi32(T3_21A, T3_21B); res06[2] = _mm_packs_epi32(T3_22A, T3_22B); res07[2] = _mm_packs_epi32(T3_23A, T3_23B); res00[3] = _mm_packs_epi32(T3_24A, T3_24B); res01[3] = _mm_packs_epi32(T3_25A, T3_25B); res02[3] = _mm_packs_epi32(T3_26A, T3_26B); res03[3] = _mm_packs_epi32(T3_27A, T3_27B); res04[3] = _mm_packs_epi32(T3_28A, T3_28B); res05[3] = _mm_packs_epi32(T3_29A, T3_29B); res06[3] = _mm_packs_epi32(T3_30A, T3_30B); res07[3] = _mm_packs_epi32(T3_31A, T3_31B); } } #define TRANSPOSE_8x8_16BIT(I0, I1, I2, I3, I4, I5, I6, I7, O0, O1, O2, O3, O4, O5, O6, O7) \ tr0_0 = _mm_unpacklo_epi16(I0, I1); \ tr0_1 = _mm_unpacklo_epi16(I2, I3); \ tr0_2 = _mm_unpackhi_epi16(I0, I1); \ tr0_3 = _mm_unpackhi_epi16(I2, I3); \ tr0_4 = _mm_unpacklo_epi16(I4, I5); \ tr0_5 = _mm_unpacklo_epi16(I6, I7); \ tr0_6 = _mm_unpackhi_epi16(I4, I5); \ tr0_7 = _mm_unpackhi_epi16(I6, I7); \ tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \ tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \ tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \ tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \ O0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ O1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ O2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ O3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ O4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \ O5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \ O6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \ O7 = _mm_unpackhi_epi64(tr1_3, tr1_7); //clip { __m128i max_val 
= _mm_set1_epi16((1 << (clip_depth2 - 1)) - 1); __m128i min_val = _mm_set1_epi16(-(1 << (clip_depth2 - 1))); c32_rnd = _mm_set1_epi32(1 << (shift2 - 1)); // add2 nShift = shift2; for (pass = 0; pass < 4; pass++) { __m128i tr0_0, tr0_1, tr0_2, tr0_3, tr0_4, tr0_5, tr0_6, tr0_7; __m128i tr1_0, tr1_1, tr1_2, tr1_3, tr1_4, tr1_5, tr1_6, tr1_7; __m128i m128Tmp0, m128Tmp1, m128Tmp2, m128Tmp3, E0h, E1h, E2h, E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O0l, O1l, O2l, O3l, EE0l, EE1l, E00l, E01l, EE0h, EE1h, E00h, E01h; TRANSPOSE_8x8_16BIT(res00[pass], res01[pass], res02[pass], res03[pass], res04[pass], res05[pass], res06[pass], res07[pass], in00, in01, in02, in03, in04, in05, in06, in07) m128Tmp0 = _mm_unpacklo_epi16(in01, in03); E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[0]))); m128Tmp1 = _mm_unpackhi_epi16(in01, in03); E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[0]))); m128Tmp2 = _mm_unpacklo_epi16(in05, in07); E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[1]))); m128Tmp3 = _mm_unpackhi_epi16(in05, in07); E2h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[1]))); O0l = _mm_add_epi32(E1l, E2l); O0h = _mm_add_epi32(E1h, E2h); E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[2]))); E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[2]))); E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[3]))); E2h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[3]))); O1l = _mm_add_epi32(E1l, E2l); O1h = _mm_add_epi32(E1h, E2h); E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[4]))); E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[4]))); E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[5]))); E2h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[5]))); O2l = _mm_add_epi32(E1l, E2l); O2h = _mm_add_epi32(E1h, E2h); E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[6]))); E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[6]))); E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[7]))); E2h = _mm_madd_epi16(m128Tmp3, _mm_load_si128((__m128i*)(tab_idct_8x8[7]))); O3h = _mm_add_epi32(E1h, E2h); O3l = _mm_add_epi32(E1l, E2l); /* ------- */ m128Tmp0 = _mm_unpacklo_epi16(in00, in04); EE0l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[8]))); m128Tmp1 = _mm_unpackhi_epi16(in00, in04); EE0h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[8]))); EE1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[9]))); EE1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[9]))); /* ------- */ m128Tmp0 = _mm_unpacklo_epi16(in02, in06); E00l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[10]))); m128Tmp1 = _mm_unpackhi_epi16(in02, in06); E00h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[10]))); E01l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[11]))); E01h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[11]))); E0l = _mm_add_epi32(EE0l, E00l); E0l = _mm_add_epi32(E0l, c32_rnd); E0h = _mm_add_epi32(EE0h, E00h); E0h = _mm_add_epi32(E0h, c32_rnd); E3l = _mm_sub_epi32(EE0l, E00l); E3l = _mm_add_epi32(E3l, c32_rnd); E3h = _mm_sub_epi32(EE0h, E00h); E3h = _mm_add_epi32(E3h, c32_rnd); E1l = _mm_add_epi32(EE1l, E01l); E1l = _mm_add_epi32(E1l, c32_rnd); E1h = _mm_add_epi32(EE1h, E01h); E1h = 
_mm_add_epi32(E1h, c32_rnd); E2l = _mm_sub_epi32(EE1l, E01l); E2l = _mm_add_epi32(E2l, c32_rnd); E2h = _mm_sub_epi32(EE1h, E01h); E2h = _mm_add_epi32(E2h, c32_rnd); in00 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E0l, O0l), nShift), _mm_srai_epi32(_mm_add_epi32(E0h, O0h), nShift)); // shift for the second transform stage (nShift == shift2 here) in07 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E0l, O0l), nShift), _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), nShift)); in01 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E1l, O1l), nShift), _mm_srai_epi32(_mm_add_epi32(E1h, O1h), nShift)); in06 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E1l, O1l), nShift), _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), nShift)); in02 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E2l, O2l), nShift), _mm_srai_epi32(_mm_add_epi32(E2h, O2h), nShift)); in05 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E2l, O2l), nShift), _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), nShift)); in03 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(E3l, O3l), nShift), _mm_srai_epi32(_mm_add_epi32(E3h, O3h), nShift)); in04 = _mm_packs_epi32(_mm_srai_epi32(_mm_sub_epi32(E3l, O3l), nShift), _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), nShift)); /* Inverse matrix (transpose back to row order) */ E0l = _mm_unpacklo_epi16(in00, in04); E1l = _mm_unpacklo_epi16(in01, in05); E2l = _mm_unpacklo_epi16(in02, in06); E3l = _mm_unpacklo_epi16(in03, in07); O0l = _mm_unpackhi_epi16(in00, in04); O1l = _mm_unpackhi_epi16(in01, in05); O2l = _mm_unpackhi_epi16(in02, in06); O3l = _mm_unpackhi_epi16(in03, in07); m128Tmp0 = _mm_unpacklo_epi16(E0l, E2l); m128Tmp1 = _mm_unpacklo_epi16(E1l, E3l); in00 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1); in00 = _mm_min_epi16(in00, max_val); in00 = _mm_max_epi16(in00, min_val); _mm_store_si128((__m128i*)&dst[pass * 8 * i_dst + 0 * 8], in00); in01 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1); in01 = _mm_min_epi16(in01, max_val); in01 = _mm_max_epi16(in01, min_val); _mm_store_si128((__m128i*)&dst[pass * 8 * i_dst + 1 * 8], in01); m128Tmp2 = _mm_unpackhi_epi16(E0l, E2l); m128Tmp3 = _mm_unpackhi_epi16(E1l, E3l); in02 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3); in02 = _mm_min_epi16(in02, max_val); in02 = _mm_max_epi16(in02, min_val); _mm_store_si128((__m128i*)&dst[pass * 8 * i_dst + 2 * 8], in02); in03 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3); in03 = _mm_min_epi16(in03, max_val); in03 = _mm_max_epi16(in03, min_val); _mm_store_si128((__m128i*)&dst[pass * 8 * i_dst + 3 * 8], in03); m128Tmp0 = _mm_unpacklo_epi16(O0l, O2l); m128Tmp1 = _mm_unpacklo_epi16(O1l, O3l); in04 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1); in04 = _mm_min_epi16(in04, max_val); in04 = _mm_max_epi16(in04, min_val); _mm_store_si128((__m128i*)&dst[pass * 8 * i_dst + 4 * 8], in04); in05 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1); in05 = _mm_min_epi16(in05, max_val); in05 = _mm_max_epi16(in05, min_val); _mm_store_si128((__m128i*)&dst[pass * 8 * i_dst + 5 * 8], in05); m128Tmp2 = _mm_unpackhi_epi16(O0l, O2l); m128Tmp3 = _mm_unpackhi_epi16(O1l, O3l); in06 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3); in06 = _mm_min_epi16(in06, max_val); in06 = _mm_max_epi16(in06, min_val); _mm_store_si128((__m128i*)&dst[pass * 8 * i_dst + 6 * 8], in06); in07 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3); in07 = _mm_min_epi16(in07, max_val); in07 = _mm_max_epi16(in07, min_val); _mm_store_si128((__m128i*)&dst[pass * 8 * i_dst + 7 * 8], in07); } } #undef TRANSPOSE_8x8_16BIT } /* --------------------------------------------------------------------------- */ void idct_8x32_half_sse128(const coeff_t *src, coeff_t *dst, int i_dst) { // TODO: implement this // 1/2 case: only the top-left 8x16 coefficients are non-zero idct_8x32_sse128(src, dst, i_dst); } 
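/*
 * NOTE on the _half/_quad fallbacks (here and in the 32x8 variants above):
 * delegating to the full-size kernel is functionally correct, because the
 * callers guarantee that every coefficient outside the retained top-left
 * region is zero, so the full butterfly reproduces the same output at extra
 * cost. A dedicated kernel would skip the dot-product terms whose source
 * rows are known to be all-zero. A hypothetical scalar sketch of that idea
 * (illustration only, not part of the decoder):
 *
 *     int dot_half(const coeff_t *col, const coeff_t *basis, int n)
 *     {
 *         int sum = 0, k;
 *         for (k = 0; k < n / 2; k++) {   // rows n/2..n-1 hold zeros only,
 *             sum += col[k] * basis[k];   // so their terms can be dropped
 *         }
 *         return sum;
 *     }
 */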
/* --------------------------------------------------------------------------- */ void idct_8x32_quad_sse128(const coeff_t *src, coeff_t *dst, int i_dst) { // TODO: implement this // 1/4 case: only the top-left 8x8 coefficients are non-zero idct_8x32_half_sse128(src, dst, i_dst); } /* --------------------------------------------------------------------------- */ static void inv_2nd_trans_hor_sse128(coeff_t *coeff, int i_coeff, int i_shift, const int16_t *tc) { int rnd_factor = 1 << (i_shift - 1); int j; __m128i factor = _mm_set1_epi32(rnd_factor); __m128i tmpZero = _mm_setzero_si128(); // 0 elements // load tc data, a matrix of 4x4 __m128i tmpLoad0 = _mm_loadu_si128((__m128i*)&tc[0 * SEC_TR_SIZE + 0]); // tc[0][] & tc[1][] __m128i tmpLoad1 = _mm_loadu_si128((__m128i*)&tc[2 * SEC_TR_SIZE + 0]); // tc[2][] & tc[3][] __m128i tmpCoef0 = _mm_unpacklo_epi16(tmpLoad0, tmpZero); // tc[0][] __m128i tmpCoef1 = _mm_unpackhi_epi16(tmpLoad0, tmpZero); // tc[1][] __m128i tmpCoef2 = _mm_unpacklo_epi16(tmpLoad1, tmpZero); // tc[2][] __m128i tmpCoef3 = _mm_unpackhi_epi16(tmpLoad1, tmpZero); // tc[3][] for (j = 0; j < 4; j++) { // multiply & add __m128i tmpProduct0 = _mm_madd_epi16(tmpCoef0, _mm_set1_epi32(coeff[0])); __m128i tmpProduct1 = _mm_madd_epi16(tmpCoef1, _mm_set1_epi32(coeff[1])); __m128i tmpProduct2 = _mm_madd_epi16(tmpCoef2, _mm_set1_epi32(coeff[2])); __m128i tmpProduct3 = _mm_madd_epi16(tmpCoef3, _mm_set1_epi32(coeff[3])); // add operation __m128i tmpDst0 = _mm_add_epi32(_mm_add_epi32(tmpProduct0, tmpProduct1), _mm_add_epi32(tmpProduct2, tmpProduct3)); // shift operation tmpDst0 = _mm_srai_epi32(_mm_add_epi32(tmpDst0, factor), i_shift); // clip3 operation tmpDst0 = _mm_packs_epi32(tmpDst0, tmpZero); // only low 64bits (4xSHORT) are valid! _mm_storel_epi64((__m128i*)coeff, tmpDst0); // store from &coeff[0] coeff += i_coeff; } } /* --------------------------------------------------------------------------- */ static void inv_2nd_trans_ver_sse128(coeff_t *coeff, int i_coeff, int i_shift, const int16_t *tc) { const int rnd_factor = 1 << (i_shift - 1); __m128i factor = _mm_set1_epi32(rnd_factor); __m128i tmpZero = _mm_setzero_si128(); // 0 elements // load coeff data __m128i tmpLoad0 = _mm_loadu_si128((__m128i*)&coeff[0 ]); __m128i tmpLoad1 = _mm_loadu_si128((__m128i*)&coeff[1 * i_coeff]); __m128i tmpLoad2 = _mm_loadu_si128((__m128i*)&coeff[2 * i_coeff]); __m128i tmpLoad3 = _mm_loadu_si128((__m128i*)&coeff[3 * i_coeff]); __m128i tmpSrc0 = _mm_unpacklo_epi16(tmpLoad0, tmpZero); // tmpSrc[0][] __m128i tmpSrc1 = _mm_unpacklo_epi16(tmpLoad1, tmpZero); // tmpSrc[1][] __m128i tmpSrc2 = _mm_unpacklo_epi16(tmpLoad2, tmpZero); // tmpSrc[2][] __m128i tmpSrc3 = _mm_unpacklo_epi16(tmpLoad3, tmpZero); // tmpSrc[3][] int i; for (i = 0; i < 4; i++) { // multiply & add __m128i tmpProduct0 = _mm_madd_epi16(_mm_set1_epi32(tc[0 * SEC_TR_SIZE + i]), tmpSrc0); __m128i tmpProduct1 = _mm_madd_epi16(_mm_set1_epi32(tc[1 * SEC_TR_SIZE + i]), tmpSrc1); __m128i tmpProduct2 = _mm_madd_epi16(_mm_set1_epi32(tc[2 * SEC_TR_SIZE + i]), tmpSrc2); __m128i tmpProduct3 = _mm_madd_epi16(_mm_set1_epi32(tc[3 * SEC_TR_SIZE + i]), tmpSrc3); // add operation __m128i tmpDst0 = _mm_add_epi32(_mm_add_epi32(tmpProduct0, tmpProduct1), _mm_add_epi32(tmpProduct2, tmpProduct3)); // shift operation tmpDst0 = _mm_srai_epi32(_mm_add_epi32(tmpDst0, factor), i_shift); // clip3 operation tmpDst0 = _mm_packs_epi32(tmpDst0, tmpZero); // only low 64bits (4xSHORT) are valid! 
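/* Note: _mm_packs_epi32 saturates each 32-bit lane to [-32768, 32767], so the pack above already realizes the 16-bit Clip3; no separate min/max pair is needed in the secondary transform. */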
// store from &coeff[0] _mm_storel_epi64((__m128i*)&coeff[0 * i_coeff + 0], tmpDst0); coeff += i_coeff; } } /* --------------------------------------------------------------------------- */ void inv_transform_2nd_sse128(coeff_t *coeff, int i_coeff, int i_mode, int b_top, int b_left) { int vt = (i_mode >= 0 && i_mode <= 23); int ht = (i_mode >= 13 && i_mode <= 32) || (i_mode >= 0 && i_mode <= 2); if (ht && b_left) { inv_2nd_trans_hor_sse128(coeff, i_coeff, 7, g_2T); } if (vt && b_top) { inv_2nd_trans_ver_sse128(coeff, i_coeff, 7, g_2T); } } /* --------------------------------------------------------------------------- */ void inv_transform_4x4_2nd_sse128(coeff_t *coeff, int i_coeff) { const int shift1 = 5; const int shift2 = 20 - g_bit_depth + 2; const int clip_depth2 = g_bit_depth + 1; /*---vertical transform first---*/ __m128i factor = _mm_set1_epi32(1 << (shift1 - 1)); // add1 __m128i tmpZero = _mm_setzero_si128(); // 0 elements // load coeff data __m128i tmpLoad0 = _mm_loadu_si128((__m128i*)&coeff[0 ]); __m128i tmpLoad1 = _mm_loadu_si128((__m128i*)&coeff[1 * i_coeff]); __m128i tmpLoad2 = _mm_loadu_si128((__m128i*)&coeff[2 * i_coeff]); __m128i tmpLoad3 = _mm_loadu_si128((__m128i*)&coeff[3 * i_coeff]); __m128i tmpSrc0 = _mm_unpacklo_epi16(tmpLoad0, tmpZero); // tmpSrc[0][] __m128i tmpSrc1 = _mm_unpacklo_epi16(tmpLoad1, tmpZero); // tmpSrc[1][] __m128i tmpSrc2 = _mm_unpacklo_epi16(tmpLoad2, tmpZero); // tmpSrc[2][] __m128i tmpSrc3 = _mm_unpacklo_epi16(tmpLoad3, tmpZero); // tmpSrc[3][] int i; for (i = 0; i < 4; i++) { // multiple & add __m128i tmpProduct0 = _mm_madd_epi16(_mm_set1_epi32(g_2T_C[0 * SEC_TR_SIZE + i]), tmpSrc0); __m128i tmpProduct1 = _mm_madd_epi16(_mm_set1_epi32(g_2T_C[1 * SEC_TR_SIZE + i]), tmpSrc1); __m128i tmpProduct2 = _mm_madd_epi16(_mm_set1_epi32(g_2T_C[2 * SEC_TR_SIZE + i]), tmpSrc2); __m128i tmpProduct3 = _mm_madd_epi16(_mm_set1_epi32(g_2T_C[3 * SEC_TR_SIZE + i]), tmpSrc3); // add operation __m128i tmpDst0 = _mm_add_epi32(_mm_add_epi32(tmpProduct0, tmpProduct1), _mm_add_epi32(tmpProduct2, tmpProduct3)); // shift operation tmpDst0 = _mm_srai_epi32(_mm_add_epi32(tmpDst0, factor), shift1); // clip3 operation tmpDst0 = _mm_packs_epi32(tmpDst0, tmpZero); // only low 64bits (4xSHORT) are valid! 
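/* First (vertical) pass: pack saturation is the only clamp applied here; the explicit clip to the (g_bit_depth + 1)-bit range via vmax_val/vmin_val happens in the horizontal pass below. */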
        _mm_storel_epi64((__m128i*)&coeff[i * i_coeff + 0], tmpDst0);  // store from &coeff[0]
    }

    /*---hor transform---*/
    factor = _mm_set1_epi32(1 << (shift2 - 1));
    const __m128i vmax_val = _mm_set1_epi32((1 << (clip_depth2 - 1)) - 1);
    const __m128i vmin_val = _mm_set1_epi32(-(1 << (clip_depth2 - 1)));

    // load coef data, a matrix of 4x4
    tmpLoad0 = _mm_loadu_si128((__m128i*)&g_2T_C[0 * SEC_TR_SIZE + 0]);   // coef[0][] & coef[1][]
    tmpLoad1 = _mm_loadu_si128((__m128i*)&g_2T_C[2 * SEC_TR_SIZE + 0]);   // coef[2][] & coef[3][]
    const __m128i tmpCoef0 = _mm_unpacklo_epi16(tmpLoad0, tmpZero);       // coef[0][]
    const __m128i tmpCoef1 = _mm_unpackhi_epi16(tmpLoad0, tmpZero);       // coef[1][]
    const __m128i tmpCoef2 = _mm_unpacklo_epi16(tmpLoad1, tmpZero);       // coef[2][]
    const __m128i tmpCoef3 = _mm_unpackhi_epi16(tmpLoad1, tmpZero);       // coef[3][]

    for (i = 0; i < 4; i++) {
        // multiply & add
        __m128i tmpProduct0 = _mm_madd_epi16(tmpCoef0, _mm_set1_epi32(coeff[0]));
        __m128i tmpProduct1 = _mm_madd_epi16(tmpCoef1, _mm_set1_epi32(coeff[1]));
        __m128i tmpProduct2 = _mm_madd_epi16(tmpCoef2, _mm_set1_epi32(coeff[2]));
        __m128i tmpProduct3 = _mm_madd_epi16(tmpCoef3, _mm_set1_epi32(coeff[3]));
        // add operation
        __m128i tmpDst0 = _mm_add_epi32(_mm_add_epi32(tmpProduct0, tmpProduct1),
                                        _mm_add_epi32(tmpProduct2, tmpProduct3));
        // shift operation
        tmpDst0 = _mm_srai_epi32(_mm_add_epi32(tmpDst0, factor), shift2);
        // clip3 operation
        tmpDst0 = _mm_max_epi32(_mm_min_epi32(tmpDst0, vmax_val), vmin_val);
        tmpDst0 = _mm_packs_epi32(tmpDst0, tmpZero);   // only low 64 bits (4 x SHORT) are valid!
        _mm_storel_epi64((__m128i*)coeff, tmpDst0);    // store from &coeff[0]
        coeff += i_coeff;
    }
}

// transpose 8x8 & transpose 16x16
#define TRANSPOSE_8x8_16BIT(I0, I1, I2, I3, I4, I5, I6, I7, O0, O1, O2, O3, O4, O5, O6, O7) \
    tr0_0 = _mm_unpacklo_epi16(I0, I1); \
    tr0_1 = _mm_unpacklo_epi16(I2, I3); \
    tr0_2 = _mm_unpackhi_epi16(I0, I1); \
    tr0_3 = _mm_unpackhi_epi16(I2, I3); \
    tr0_4 = _mm_unpacklo_epi16(I4, I5); \
    tr0_5 = _mm_unpacklo_epi16(I6, I7); \
    tr0_6 = _mm_unpackhi_epi16(I4, I5); \
    tr0_7 = _mm_unpackhi_epi16(I6, I7); \
    tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
    tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \
    tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
    tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \
    tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
    tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \
    tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
    tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \
    O0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
    O1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
    O2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
    O3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
    O4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \
    O5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \
    O6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \
    O7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \

#define TRANSPOSE_16x16_16BIT(A0_0, A1_0, A2_0, A3_0, A4_0, A5_0, A6_0, A7_0, A8_0, A9_0, A10_0, A11_0, A12_0, A13_0, A14_0, A15_0, \
                              A0_1, A1_1, A2_1, A3_1, A4_1, A5_1, A6_1, A7_1, A8_1, A9_1, A10_1, A11_1, A12_1, A13_1, A14_1, A15_1, \
                              B0_0, B1_0, B2_0, B3_0, B4_0, B5_0, B6_0, B7_0, B8_0, B9_0, B10_0, B11_0, B12_0, B13_0, B14_0, B15_0, \
                              B0_1, B1_1, B2_1, B3_1, B4_1, B5_1, B6_1, B7_1, B8_1, B9_1, B10_1, B11_1, B12_1, B13_1, B14_1, B15_1) \
    TRANSPOSE_8x8_16BIT(A0_0, A1_0, A2_0,  A3_0,  A4_0,  A5_0,  A6_0,  A7_0,  B0_0, B1_0, B2_0,  B3_0,  B4_0,  B5_0,  B6_0,  B7_0);  \
    TRANSPOSE_8x8_16BIT(A8_0, A9_0, A10_0, A11_0, A12_0, A13_0, A14_0, A15_0, B0_1, B1_1, B2_1,  B3_1,  B4_1,  B5_1,  B6_1,  B7_1);  \
    TRANSPOSE_8x8_16BIT(A0_1, A1_1, A2_1,  A3_1,  A4_1,  A5_1,  A6_1,  A7_1,  B8_0, B9_0, B10_0, B11_0, B12_0, B13_0, B14_0, B15_0); \
    TRANSPOSE_8x8_16BIT(A8_1, A9_1, A10_1, A11_1, A12_1, A13_1, A14_1, A15_1, B8_1, B9_1, B10_1, B11_1, B12_1, B13_1, B14_1, B15_1); \

/* --------------------------------------------------------------------------- */
void inv_wavelet_64x64_sse128(coeff_t *coeff)
{
    int i;

    // 64*64
    __m128i T00[8], T01[8], T02[8], T03[8], T04[8], T05[8], T06[8], T07[8],
            T08[8], T09[8], T10[8], T11[8], T12[8], T13[8], T14[8], T15[8],
            T16[8], T17[8], T18[8], T19[8], T20[8], T21[8], T22[8], T23[8],
            T24[8], T25[8], T26[8], T27[8], T28[8], T29[8], T30[8], T31[8],
            T32[8], T33[8], T34[8], T35[8], T36[8], T37[8], T38[8], T39[8],
            T40[8], T41[8], T42[8], T43[8], T44[8], T45[8], T46[8], T47[8],
            T48[8], T49[8], T50[8], T51[8], T52[8], T53[8], T54[8], T55[8],
            T56[8], T57[8], T58[8], T59[8], T60[8], T61[8], T62[8], T63[8];
    // 16*64
    __m128i V00[8], V01[8], V02[8], V03[8], V04[8], V05[8], V06[8], V07[8],
            V08[8], V09[8], V10[8], V11[8], V12[8], V13[8], V14[8], V15[8],
            V16[8], V17[8], V18[8], V19[8], V20[8], V21[8], V22[8], V23[8],
            V24[8], V25[8], V26[8], V27[8], V28[8], V29[8], V30[8], V31[8],
            V32[8], V33[8], V34[8], V35[8], V36[8], V37[8], V38[8], V39[8],
            V40[8], V41[8], V42[8], V43[8], V44[8], V45[8], V46[8], V47[8],
            V48[8], V49[8], V50[8], V51[8], V52[8], V53[8], V54[8], V55[8],
            V56[8], V57[8], V58[8], V59[8], V60[8], V61[8], V62[8], V63[8];
    __m128i tr0_0, tr0_1, tr0_2, tr0_3, tr0_4, tr0_5, tr0_6, tr0_7;
    __m128i tr1_0, tr1_1, tr1_2, tr1_3, tr1_4, tr1_5, tr1_6, tr1_7;

    /*--vertical transform--*/
    //32*32, LOAD AND SHIFT
    for (i = 0; i < 4; i++) {
        T00[i] = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * i + 32 * 0]), 1);
        T01[i] = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * i + 32 * 1]), 1);
        T02[i] = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * i + 32 * 2]), 1);
        T03[i] = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * i + 32 * 3]), 1);
        T04[i] = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * i + 32 * 4]), 1);
        T05[i] = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * i + 32 * 5]), 1);
        T06[i] = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * i + 32 * 6]), 1);
        T07[i] = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * i + 32 * 7]), 1);
        T08[i] = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * i + 32 * 8]), 1);
        T09[i] = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * i + 32 * 9]), 1);
        T10[i] = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * i + 32 * 10]), 1);
        T11[i] = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * i + 32 * 11]), 1);
        T12[i] = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * i + 32 * 12]), 1);
        T13[i] = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * i + 32 * 13]), 1);
        T14[i] = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * i + 32 * 14]), 1);
        T15[i] = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * i + 32 * 15]), 1);
        T16[i] = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * i + 32 * 16]), 1);
        T17[i] = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * i + 32 * 17]), 1);
        T18[i] = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * i + 32 * 18]), 1);
        T19[i] = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * i + 32 * 19]), 1);
        T20[i] = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * i + 32 * 20]), 1);
        T21[i] = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * i + 32 * 21]), 1);
        T22[i] = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * i + 32 * 22]), 1);
        T23[i] = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * i + 32 * 23]), 1);
        T24[i] = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * i + 32 * 24]), 1);
        T25[i] =
_mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * i + 32 * 25]), 1); T26[i] = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * i + 32 * 26]), 1); T27[i] = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * i + 32 * 27]), 1); T28[i] = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * i + 32 * 28]), 1); T29[i] = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * i + 32 * 29]), 1); T30[i] = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * i + 32 * 30]), 1); T31[i] = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * i + 32 * 31]), 1); } //filter (odd pixel/row) for (i = 0; i < 4; i++) { T32[i] = _mm_srai_epi16(_mm_add_epi16(T00[i], T01[i]), 1); T33[i] = _mm_srai_epi16(_mm_add_epi16(T01[i], T02[i]), 1); T34[i] = _mm_srai_epi16(_mm_add_epi16(T02[i], T03[i]), 1); T35[i] = _mm_srai_epi16(_mm_add_epi16(T03[i], T04[i]), 1); T36[i] = _mm_srai_epi16(_mm_add_epi16(T04[i], T05[i]), 1); T37[i] = _mm_srai_epi16(_mm_add_epi16(T05[i], T06[i]), 1); T38[i] = _mm_srai_epi16(_mm_add_epi16(T06[i], T07[i]), 1); T39[i] = _mm_srai_epi16(_mm_add_epi16(T07[i], T08[i]), 1); T40[i] = _mm_srai_epi16(_mm_add_epi16(T08[i], T09[i]), 1); T41[i] = _mm_srai_epi16(_mm_add_epi16(T09[i], T10[i]), 1); T42[i] = _mm_srai_epi16(_mm_add_epi16(T10[i], T11[i]), 1); T43[i] = _mm_srai_epi16(_mm_add_epi16(T11[i], T12[i]), 1); T44[i] = _mm_srai_epi16(_mm_add_epi16(T12[i], T13[i]), 1); T45[i] = _mm_srai_epi16(_mm_add_epi16(T13[i], T14[i]), 1); T46[i] = _mm_srai_epi16(_mm_add_epi16(T14[i], T15[i]), 1); T47[i] = _mm_srai_epi16(_mm_add_epi16(T15[i], T16[i]), 1); T48[i] = _mm_srai_epi16(_mm_add_epi16(T16[i], T17[i]), 1); T49[i] = _mm_srai_epi16(_mm_add_epi16(T17[i], T18[i]), 1); T50[i] = _mm_srai_epi16(_mm_add_epi16(T18[i], T19[i]), 1); T51[i] = _mm_srai_epi16(_mm_add_epi16(T19[i], T20[i]), 1); T52[i] = _mm_srai_epi16(_mm_add_epi16(T20[i], T21[i]), 1); T53[i] = _mm_srai_epi16(_mm_add_epi16(T21[i], T22[i]), 1); T54[i] = _mm_srai_epi16(_mm_add_epi16(T22[i], T23[i]), 1); T55[i] = _mm_srai_epi16(_mm_add_epi16(T23[i], T24[i]), 1); T56[i] = _mm_srai_epi16(_mm_add_epi16(T24[i], T25[i]), 1); T57[i] = _mm_srai_epi16(_mm_add_epi16(T25[i], T26[i]), 1); T58[i] = _mm_srai_epi16(_mm_add_epi16(T26[i], T27[i]), 1); T59[i] = _mm_srai_epi16(_mm_add_epi16(T27[i], T28[i]), 1); T60[i] = _mm_srai_epi16(_mm_add_epi16(T28[i], T29[i]), 1); T61[i] = _mm_srai_epi16(_mm_add_epi16(T29[i], T30[i]), 1); T62[i] = _mm_srai_epi16(_mm_add_epi16(T30[i], T31[i]), 1); T63[i] = _mm_srai_epi16(_mm_add_epi16(T31[i], T31[i]), 1); } /*--transposition--*/ //32x64 -> 64x32 TRANSPOSE_16x16_16BIT( T00[0], T32[0], T01[0], T33[0], T02[0], T34[0], T03[0], T35[0], T04[0], T36[0], T05[0], T37[0], T06[0], T38[0], T07[0], T39[0], T00[1], T32[1], T01[1], T33[1], T02[1], T34[1], T03[1], T35[1], T04[1], T36[1], T05[1], T37[1], T06[1], T38[1], T07[1], T39[1], V00[0], V01[0], V02[0], V03[0], V04[0], V05[0], V06[0], V07[0], V08[0], V09[0], V10[0], V11[0], V12[0], V13[0], V14[0], V15[0], V00[1], V01[1], V02[1], V03[1], V04[1], V05[1], V06[1], V07[1], V08[1], V09[1], V10[1], V11[1], V12[1], V13[1], V14[1], V15[1]); TRANSPOSE_16x16_16BIT( T00[2], T32[2], T01[2], T33[2], T02[2], T34[2], T03[2], T35[2], T04[2], T36[2], T05[2], T37[2], T06[2], T38[2], T07[2], T39[2], T00[3], T32[3], T01[3], T33[3], T02[3], T34[3], T03[3], T35[3], T04[3], T36[3], T05[3], T37[3], T06[3], T38[3], T07[3], T39[3], V16[0], V17[0], V18[0], V19[0], V20[0], V21[0], V22[0], V23[0], V24[0], V25[0], V26[0], V27[0], V28[0], V29[0], V30[0], V31[0], V16[1], V17[1], V18[1], V19[1], V20[1], V21[1], V22[1], V23[1], V24[1], 
V25[1], V26[1], V27[1], V28[1], V29[1], V30[1], V31[1]); TRANSPOSE_16x16_16BIT( T08[0], T40[0], T09[0], T41[0], T10[0], T42[0], T11[0], T43[0], T12[0], T44[0], T13[0], T45[0], T14[0], T46[0], T15[0], T47[0], T08[1], T40[1], T09[1], T41[1], T10[1], T42[1], T11[1], T43[1], T12[1], T44[1], T13[1], T45[1], T14[1], T46[1], T15[1], T47[1], V00[2], V01[2], V02[2], V03[2], V04[2], V05[2], V06[2], V07[2], V08[2], V09[2], V10[2], V11[2], V12[2], V13[2], V14[2], V15[2], V00[3], V01[3], V02[3], V03[3], V04[3], V05[3], V06[3], V07[3], V08[3], V09[3], V10[3], V11[3], V12[3], V13[3], V14[3], V15[3]); TRANSPOSE_16x16_16BIT( T08[2], T40[2], T09[2], T41[2], T10[2], T42[2], T11[2], T43[2], T12[2], T44[2], T13[2], T45[2], T14[2], T46[2], T15[2], T47[2], T08[3], T40[3], T09[3], T41[3], T10[3], T42[3], T11[3], T43[3], T12[3], T44[3], T13[3], T45[3], T14[3], T46[3], T15[3], T47[3], V16[2], V17[2], V18[2], V19[2], V20[2], V21[2], V22[2], V23[2], V24[2], V25[2], V26[2], V27[2], V28[2], V29[2], V30[2], V31[2], V16[3], V17[3], V18[3], V19[3], V20[3], V21[3], V22[3], V23[3], V24[3], V25[3], V26[3], V27[3], V28[3], V29[3], V30[3], V31[3]); TRANSPOSE_16x16_16BIT( T16[0], T48[0], T17[0], T49[0], T18[0], T50[0], T19[0], T51[0], T20[0], T52[0], T21[0], T53[0], T22[0], T54[0], T23[0], T55[0], T16[1], T48[1], T17[1], T49[1], T18[1], T50[1], T19[1], T51[1], T20[1], T52[1], T21[1], T53[1], T22[1], T54[1], T23[1], T55[1], V00[4], V01[4], V02[4], V03[4], V04[4], V05[4], V06[4], V07[4], V08[4], V09[4], V10[4], V11[4], V12[4], V13[4], V14[4], V15[4], V00[5], V01[5], V02[5], V03[5], V04[5], V05[5], V06[5], V07[5], V08[5], V09[5], V10[5], V11[5], V12[5], V13[5], V14[5], V15[5]); TRANSPOSE_16x16_16BIT( T16[2], T48[2], T17[2], T49[2], T18[2], T50[2], T19[2], T51[2], T20[2], T52[2], T21[2], T53[2], T22[2], T54[2], T23[2], T55[2], T16[3], T48[3], T17[3], T49[3], T18[3], T50[3], T19[3], T51[3], T20[3], T52[3], T21[3], T53[3], T22[3], T54[3], T23[3], T55[3], V16[4], V17[4], V18[4], V19[4], V20[4], V21[4], V22[4], V23[4], V24[4], V25[4], V26[4], V27[4], V28[4], V29[4], V30[4], V31[4], V16[5], V17[5], V18[5], V19[5], V20[5], V21[5], V22[5], V23[5], V24[5], V25[5], V26[5], V27[5], V28[5], V29[5], V30[5], V31[5]); TRANSPOSE_16x16_16BIT( T24[0], T56[0], T25[0], T57[0], T26[0], T58[0], T27[0], T59[0], T28[0], T60[0], T29[0], T61[0], T30[0], T62[0], T31[0], T63[0], T24[1], T56[1], T25[1], T57[1], T26[1], T58[1], T27[1], T59[1], T28[1], T60[1], T29[1], T61[1], T30[1], T62[1], T31[1], T63[1], V00[6], V01[6], V02[6], V03[6], V04[6], V05[6], V06[6], V07[6], V08[6], V09[6], V10[6], V11[6], V12[6], V13[6], V14[6], V15[6], V00[7], V01[7], V02[7], V03[7], V04[7], V05[7], V06[7], V07[7], V08[7], V09[7], V10[7], V11[7], V12[7], V13[7], V14[7], V15[7]); TRANSPOSE_16x16_16BIT( T24[2], T56[2], T25[2], T57[2], T26[2], T58[2], T27[2], T59[2], T28[2], T60[2], T29[2], T61[2], T30[2], T62[2], T31[2], T63[2], T24[3], T56[3], T25[3], T57[3], T26[3], T58[3], T27[3], T59[3], T28[3], T60[3], T29[3], T61[3], T30[3], T62[3], T31[3], T63[3], V16[6], V17[6], V18[6], V19[6], V20[6], V21[6], V22[6], V23[6], V24[6], V25[6], V26[6], V27[6], V28[6], V29[6], V30[6], V31[6], V16[7], V17[7], V18[7], V19[7], V20[7], V21[7], V22[7], V23[7], V24[7], V25[7], V26[7], V27[7], V28[7], V29[7], V30[7], V31[7]); /*--horizontal transform--*/ //filter (odd pixel/column) for (i = 0; i < 8; i++) { V32[i] = _mm_srai_epi16(_mm_add_epi16(V00[i], V01[i]), 1); V33[i] = _mm_srai_epi16(_mm_add_epi16(V01[i], V02[i]), 1); V34[i] = _mm_srai_epi16(_mm_add_epi16(V02[i], V03[i]), 1); V35[i] = 
_mm_srai_epi16(_mm_add_epi16(V03[i], V04[i]), 1); V36[i] = _mm_srai_epi16(_mm_add_epi16(V04[i], V05[i]), 1); V37[i] = _mm_srai_epi16(_mm_add_epi16(V05[i], V06[i]), 1); V38[i] = _mm_srai_epi16(_mm_add_epi16(V06[i], V07[i]), 1); V39[i] = _mm_srai_epi16(_mm_add_epi16(V07[i], V08[i]), 1); V40[i] = _mm_srai_epi16(_mm_add_epi16(V08[i], V09[i]), 1); V41[i] = _mm_srai_epi16(_mm_add_epi16(V09[i], V10[i]), 1); V42[i] = _mm_srai_epi16(_mm_add_epi16(V10[i], V11[i]), 1); V43[i] = _mm_srai_epi16(_mm_add_epi16(V11[i], V12[i]), 1); V44[i] = _mm_srai_epi16(_mm_add_epi16(V12[i], V13[i]), 1); V45[i] = _mm_srai_epi16(_mm_add_epi16(V13[i], V14[i]), 1); V46[i] = _mm_srai_epi16(_mm_add_epi16(V14[i], V15[i]), 1); V47[i] = _mm_srai_epi16(_mm_add_epi16(V15[i], V16[i]), 1); V48[i] = _mm_srai_epi16(_mm_add_epi16(V16[i], V17[i]), 1); V49[i] = _mm_srai_epi16(_mm_add_epi16(V17[i], V18[i]), 1); V50[i] = _mm_srai_epi16(_mm_add_epi16(V18[i], V19[i]), 1); V51[i] = _mm_srai_epi16(_mm_add_epi16(V19[i], V20[i]), 1); V52[i] = _mm_srai_epi16(_mm_add_epi16(V20[i], V21[i]), 1); V53[i] = _mm_srai_epi16(_mm_add_epi16(V21[i], V22[i]), 1); V54[i] = _mm_srai_epi16(_mm_add_epi16(V22[i], V23[i]), 1); V55[i] = _mm_srai_epi16(_mm_add_epi16(V23[i], V24[i]), 1); V56[i] = _mm_srai_epi16(_mm_add_epi16(V24[i], V25[i]), 1); V57[i] = _mm_srai_epi16(_mm_add_epi16(V25[i], V26[i]), 1); V58[i] = _mm_srai_epi16(_mm_add_epi16(V26[i], V27[i]), 1); V59[i] = _mm_srai_epi16(_mm_add_epi16(V27[i], V28[i]), 1); V60[i] = _mm_srai_epi16(_mm_add_epi16(V28[i], V29[i]), 1); V61[i] = _mm_srai_epi16(_mm_add_epi16(V29[i], V30[i]), 1); V62[i] = _mm_srai_epi16(_mm_add_epi16(V30[i], V31[i]), 1); V63[i] = _mm_srai_epi16(_mm_add_epi16(V31[i], V31[i]), 1); } /*--transposition & Store--*/ //64x64 TRANSPOSE_16x16_16BIT( V00[0], V32[0], V01[0], V33[0], V02[0], V34[0], V03[0], V35[0], V04[0], V36[0], V05[0], V37[0], V06[0], V38[0], V07[0], V39[0], V00[1], V32[1], V01[1], V33[1], V02[1], V34[1], V03[1], V35[1], V04[1], V36[1], V05[1], V37[1], V06[1], V38[1], V07[1], V39[1], T00[0], T01[0], T02[0], T03[0], T04[0], T05[0], T06[0], T07[0], T08[0], T09[0], T10[0], T11[0], T12[0], T13[0], T14[0], T15[0], T00[1], T01[1], T02[1], T03[1], T04[1], T05[1], T06[1], T07[1], T08[1], T09[1], T10[1], T11[1], T12[1], T13[1], T14[1], T15[1]); TRANSPOSE_16x16_16BIT( V00[2], V32[2], V01[2], V33[2], V02[2], V34[2], V03[2], V35[2], V04[2], V36[2], V05[2], V37[2], V06[2], V38[2], V07[2], V39[2], V00[3], V32[3], V01[3], V33[3], V02[3], V34[3], V03[3], V35[3], V04[3], V36[3], V05[3], V37[3], V06[3], V38[3], V07[3], V39[3], T16[0], T17[0], T18[0], T19[0], T20[0], T21[0], T22[0], T23[0], T24[0], T25[0], T26[0], T27[0], T28[0], T29[0], T30[0], T31[0], T16[1], T17[1], T18[1], T19[1], T20[1], T21[1], T22[1], T23[1], T24[1], T25[1], T26[1], T27[1], T28[1], T29[1], T30[1], T31[1]); TRANSPOSE_16x16_16BIT(V00[4], V32[4], V01[4], V33[4], V02[4], V34[4], V03[4], V35[4], V04[4], V36[4], V05[4], V37[4], V06[4], V38[4], V07[4], V39[4], V00[5], V32[5], V01[5], V33[5], V02[5], V34[5], V03[5], V35[5], V04[5], V36[5], V05[5], V37[5], V06[5], V38[5], V07[5], V39[5], T32[0], T33[0], T34[0], T35[0], T36[0], T37[0], T38[0], T39[0], T40[0], T41[0], T42[0], T43[0], T44[0], T45[0], T46[0], T47[0], T32[1], T33[1], T34[1], T35[1], T36[1], T37[1], T38[1], T39[1], T40[1], T41[1], T42[1], T43[1], T44[1], T45[1], T46[1], T47[1]); TRANSPOSE_16x16_16BIT(V00[6], V32[6], V01[6], V33[6], V02[6], V34[6], V03[6], V35[6], V04[6], V36[6], V05[6], V37[6], V06[6], V38[6], V07[6], V39[6], V00[7], V32[7], V01[7], V33[7], V02[7], V34[7], V03[7], 
V35[7], V04[7], V36[7], V05[7], V37[7], V06[7], V38[7], V07[7], V39[7], T48[0], T49[0], T50[0], T51[0], T52[0], T53[0], T54[0], T55[0], T56[0], T57[0], T58[0], T59[0], T60[0], T61[0], T62[0], T63[0], T48[1], T49[1], T50[1], T51[1], T52[1], T53[1], T54[1], T55[1], T56[1], T57[1], T58[1], T59[1], T60[1], T61[1], T62[1], T63[1]); TRANSPOSE_16x16_16BIT( V08[0], V40[0], V09[0], V41[0], V10[0], V42[0], V11[0], V43[0], V12[0], V44[0], V13[0], V45[0], V14[0], V46[0], V15[0], V47[0], V08[1], V40[1], V09[1], V41[1], V10[1], V42[1], V11[1], V43[1], V12[1], V44[1], V13[1], V45[1], V14[1], V46[1], V15[1], V47[1], T00[2], T01[2], T02[2], T03[2], T04[2], T05[2], T06[2], T07[2], T08[2], T09[2], T10[2], T11[2], T12[2], T13[2], T14[2], T15[2], T00[3], T01[3], T02[3], T03[3], T04[3], T05[3], T06[3], T07[3], T08[3], T09[3], T10[3], T11[3], T12[3], T13[3], T14[3], T15[3]); TRANSPOSE_16x16_16BIT( V08[2], V40[2], V09[2], V41[2], V10[2], V42[2], V11[2], V43[2], V12[2], V44[2], V13[2], V45[2], V14[2], V46[2], V15[2], V47[2], V08[3], V40[3], V09[3], V41[3], V10[3], V42[3], V11[3], V43[3], V12[3], V44[3], V13[3], V45[3], V14[3], V46[3], V15[3], V47[3], T16[2], T17[2], T18[2], T19[2], T20[2], T21[2], T22[2], T23[2], T24[2], T25[2], T26[2], T27[2], T28[2], T29[2], T30[2], T31[2], T16[3], T17[3], T18[3], T19[3], T20[3], T21[3], T22[3], T23[3], T24[3], T25[3], T26[3], T27[3], T28[3], T29[3], T30[3], T31[3]); TRANSPOSE_16x16_16BIT( V08[4], V40[4], V09[4], V41[4], V10[4], V42[4], V11[4], V43[4], V12[4], V44[4], V13[4], V45[4], V14[4], V46[4], V15[4], V47[4], V08[5], V40[5], V09[5], V41[5], V10[5], V42[5], V11[5], V43[5], V12[5], V44[5], V13[5], V45[5], V14[5], V46[5], V15[5], V47[5], T32[2], T33[2], T34[2], T35[2], T36[2], T37[2], T38[2], T39[2], T40[2], T41[2], T42[2], T43[2], T44[2], T45[2], T46[2], T47[2], T32[3], T33[3], T34[3], T35[3], T36[3], T37[3], T38[3], T39[3], T40[3], T41[3], T42[3], T43[3], T44[3], T45[3], T46[3], T47[3]); TRANSPOSE_16x16_16BIT( V08[6], V40[6], V09[6], V41[6], V10[6], V42[6], V11[6], V43[6], V12[6], V44[6], V13[6], V45[6], V14[6], V46[6], V15[6], V47[6], V08[7], V40[7], V09[7], V41[7], V10[7], V42[7], V11[7], V43[7], V12[7], V44[7], V13[7], V45[7], V14[7], V46[7], V15[7], V47[7], T48[2], T49[2], T50[2], T51[2], T52[2], T53[2], T54[2], T55[2], T56[2], T57[2], T58[2], T59[2], T60[2], T61[2], T62[2], T63[2], T48[3], T49[3], T50[3], T51[3], T52[3], T53[3], T54[3], T55[3], T56[3], T57[3], T58[3], T59[3], T60[3], T61[3], T62[3], T63[3]); TRANSPOSE_16x16_16BIT( V16[0], V48[0], V17[0], V49[0], V18[0], V50[0], V19[0], V51[0], V20[0], V52[0], V21[0], V53[0], V22[0], V54[0], V23[0], V55[0], V16[1], V48[1], V17[1], V49[1], V18[1], V50[1], V19[1], V51[1], V20[1], V52[1], V21[1], V53[1], V22[1], V54[1], V23[1], V55[1], T00[4], T01[4], T02[4], T03[4], T04[4], T05[4], T06[4], T07[4], T08[4], T09[4], T10[4], T11[4], T12[4], T13[4], T14[4], T15[4], T00[5], T01[5], T02[5], T03[5], T04[5], T05[5], T06[5], T07[5], T08[5], T09[5], T10[5], T11[5], T12[5], T13[5], T14[5], T15[5]); TRANSPOSE_16x16_16BIT( V16[2], V48[2], V17[2], V49[2], V18[2], V50[2], V19[2], V51[2], V20[2], V52[2], V21[2], V53[2], V22[2], V54[2], V23[2], V55[2], V16[3], V48[3], V17[3], V49[3], V18[3], V50[3], V19[3], V51[3], V20[3], V52[3], V21[3], V53[3], V22[3], V54[3], V23[3], V55[3], T16[4], T17[4], T18[4], T19[4], T20[4], T21[4], T22[4], T23[4], T24[4], T25[4], T26[4], T27[4], T28[4], T29[4], T30[4], T31[4], T16[5], T17[5], T18[5], T19[5], T20[5], T21[5], T22[5], T23[5], T24[5], T25[5], T26[5], T27[5], T28[5], T29[5], T30[5], T31[5]); 
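    /*
     * Each wavelet pass doubles one dimension: even outputs keep the input
     * samples (pre-scaled once by the >>1 applied at load time), and every odd
     * output is the average of its two even neighbours; the last odd sample
     * repeats the edge, since (x + x) >> 1 == x. An illustrative scalar
     * equivalent of one pass (not part of the original source):
     *
     *     for (k = 0; k < N - 1; k++) {
     *         out[2 * k]     = in[k];
     *         out[2 * k + 1] = (in[k] + in[k + 1]) >> 1;
     *     }
     *     out[2 * N - 2] = in[N - 1];
     *     out[2 * N - 1] = in[N - 1];
     *
     * The TRANSPOSE_16x16_16BIT calls in this section interleave the even
     * registers (V00..V31) with the filtered odd registers (V32..V63) while
     * transposing back, which produces exactly this even/odd sample ordering
     * in the stored 64x64 output.
     */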
TRANSPOSE_16x16_16BIT( V16[4], V48[4], V17[4], V49[4], V18[4], V50[4], V19[4], V51[4], V20[4], V52[4], V21[4], V53[4], V22[4], V54[4], V23[4], V55[4], V16[5], V48[5], V17[5], V49[5], V18[5], V50[5], V19[5], V51[5], V20[5], V52[5], V21[5], V53[5], V22[5], V54[5], V23[5], V55[5], T32[4], T33[4], T34[4], T35[4], T36[4], T37[4], T38[4], T39[4], T40[4], T41[4], T42[4], T43[4], T44[4], T45[4], T46[4], T47[4], T32[5], T33[5], T34[5], T35[5], T36[5], T37[5], T38[5], T39[5], T40[5], T41[5], T42[5], T43[5], T44[5], T45[5], T46[5], T47[5]); TRANSPOSE_16x16_16BIT( V16[6], V48[6], V17[6], V49[6], V18[6], V50[6], V19[6], V51[6], V20[6], V52[6], V21[6], V53[6], V22[6], V54[6], V23[6], V55[6], V16[7], V48[7], V17[7], V49[7], V18[7], V50[7], V19[7], V51[7], V20[7], V52[7], V21[7], V53[7], V22[7], V54[7], V23[7], V55[7], T48[4], T49[4], T50[4], T51[4], T52[4], T53[4], T54[4], T55[4], T56[4], T57[4], T58[4], T59[4], T60[4], T61[4], T62[4], T63[4], T48[5], T49[5], T50[5], T51[5], T52[5], T53[5], T54[5], T55[5], T56[5], T57[5], T58[5], T59[5], T60[5], T61[5], T62[5], T63[5]); TRANSPOSE_16x16_16BIT( V24[0], V56[0], V25[0], V57[0], V26[0], V58[0], V27[0], V59[0], V28[0], V60[0], V29[0], V61[0], V30[0], V62[0], V31[0], V63[0], V24[1], V56[1], V25[1], V57[1], V26[1], V58[1], V27[1], V59[1], V28[1], V60[1], V29[1], V61[1], V30[1], V62[1], V31[1], V63[1], T00[6], T01[6], T02[6], T03[6], T04[6], T05[6], T06[6], T07[6], T08[6], T09[6], T10[6], T11[6], T12[6], T13[6], T14[6], T15[6], T00[7], T01[7], T02[7], T03[7], T04[7], T05[7], T06[7], T07[7], T08[7], T09[7], T10[7], T11[7], T12[7], T13[7], T14[7], T15[7]); TRANSPOSE_16x16_16BIT( V24[2], V56[2], V25[2], V57[2], V26[2], V58[2], V27[2], V59[2], V28[2], V60[2], V29[2], V61[2], V30[2], V62[2], V31[2], V63[2], V24[3], V56[3], V25[3], V57[3], V26[3], V58[3], V27[3], V59[3], V28[3], V60[3], V29[3], V61[3], V30[3], V62[3], V31[3], V63[3], T16[6], T17[6], T18[6], T19[6], T20[6], T21[6], T22[6], T23[6], T24[6], T25[6], T26[6], T27[6], T28[6], T29[6], T30[6], T31[6], T16[7], T17[7], T18[7], T19[7], T20[7], T21[7], T22[7], T23[7], T24[7], T25[7], T26[7], T27[7], T28[7], T29[7], T30[7], T31[7]); TRANSPOSE_16x16_16BIT( V24[4], V56[4], V25[4], V57[4], V26[4], V58[4], V27[4], V59[4], V28[4], V60[4], V29[4], V61[4], V30[4], V62[4], V31[4], V63[4], V24[5], V56[5], V25[5], V57[5], V26[5], V58[5], V27[5], V59[5], V28[5], V60[5], V29[5], V61[5], V30[5], V62[5], V31[5], V63[5], T32[6], T33[6], T34[6], T35[6], T36[6], T37[6], T38[6], T39[6], T40[6], T41[6], T42[6], T43[6], T44[6], T45[6], T46[6], T47[6], T32[7], T33[7], T34[7], T35[7], T36[7], T37[7], T38[7], T39[7], T40[7], T41[7], T42[7], T43[7], T44[7], T45[7], T46[7], T47[7]); TRANSPOSE_16x16_16BIT( V24[6], V56[6], V25[6], V57[6], V26[6], V58[6], V27[6], V59[6], V28[6], V60[6], V29[6], V61[6], V30[6], V62[6], V31[6], V63[6], V24[7], V56[7], V25[7], V57[7], V26[7], V58[7], V27[7], V59[7], V28[7], V60[7], V29[7], V61[7], V30[7], V62[7], V31[7], V63[7], T48[6], T49[6], T50[6], T51[6], T52[6], T53[6], T54[6], T55[6], T56[6], T57[6], T58[6], T59[6], T60[6], T61[6], T62[6], T63[6], T48[7], T49[7], T50[7], T51[7], T52[7], T53[7], T54[7], T55[7], T56[7], T57[7], T58[7], T59[7], T60[7], T61[7], T62[7], T63[7]); //store for (i = 0; i < 8; i++) { _mm_storeu_si128((__m128i*)&coeff[8 * i ], T00[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 ], T01[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 2], T02[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 3], T03[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 4], T04[i]); 
_mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 5], T05[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 6], T06[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 7], T07[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 8], T08[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 9], T09[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 10], T10[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 11], T11[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 12], T12[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 13], T13[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 14], T14[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 15], T15[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 16], T16[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 17], T17[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 18], T18[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 19], T19[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 20], T20[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 21], T21[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 22], T22[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 23], T23[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 24], T24[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 25], T25[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 26], T26[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 27], T27[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 28], T28[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 29], T29[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 30], T30[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 31], T31[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 32], T32[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 33], T33[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 34], T34[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 35], T35[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 36], T36[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 37], T37[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 38], T38[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 39], T39[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 40], T40[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 41], T41[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 42], T42[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 43], T43[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 44], T44[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 45], T45[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 46], T46[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 47], T47[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 48], T48[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 49], T49[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 50], T50[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 51], T51[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 52], T52[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 53], T53[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 54], T54[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 55], T55[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 56], T56[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 57], T57[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 58], T58[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 59], T59[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 60], T60[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 61], T61[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 62], T62[i]); 
_mm_storeu_si128((__m128i*)&coeff[8 * i + 64 * 63], T63[i]); } } /* --------------------------------------------------------------------------- */ void inv_wavelet_64x16_sse128(coeff_t *coeff) { int i; // 64*16 __m128i T00[8], T01[8], T02[8], T03[8], T04[8], T05[8], T06[8], T07[8], T08[8], T09[8], T10[8], T11[8], T12[8], T13[8], T14[8], T15[8]; // 16*64 __m128i V00[2], V01[2], V02[2], V03[2], V04[2], V05[2], V06[2], V07[2], V08[2], V09[2], V10[2], V11[2], V12[2], V13[2], V14[2], V15[2], V16[2], V17[2], V18[2], V19[2], V20[2], V21[2], V22[2], V23[2], V24[2], V25[2], V26[2], V27[2], V28[2], V29[2], V30[2], V31[2], V32[2], V33[2], V34[2], V35[2], V36[2], V37[2], V38[2], V39[2], V40[2], V41[2], V42[2], V43[2], V44[2], V45[2], V46[2], V47[2], V48[2], V49[2], V50[2], V51[2], V52[2], V53[2], V54[2], V55[2], V56[2], V57[2], V58[2], V59[2], V60[2], V61[2], V62[2], V63[2]; __m128i tr0_0, tr0_1, tr0_2, tr0_3, tr0_4, tr0_5, tr0_6, tr0_7; __m128i tr1_0, tr1_1, tr1_2, tr1_3, tr1_4, tr1_5, tr1_6, tr1_7; /*--vertical transform--*/ //32*8, LOAD AND SHIFT T00[0] = _mm_srai_epi16(_mm_load_si128((__m128i*)&coeff[ 0 + 32 * 0]), 1); T01[0] = _mm_srai_epi16(_mm_load_si128((__m128i*)&coeff[ 0 + 32 * 1]), 1); T02[0] = _mm_srai_epi16(_mm_load_si128((__m128i*)&coeff[ 0 + 32 * 2]), 1); T03[0] = _mm_srai_epi16(_mm_load_si128((__m128i*)&coeff[ 0 + 32 * 3]), 1); T04[0] = _mm_srai_epi16(_mm_load_si128((__m128i*)&coeff[ 0 + 32 * 4]), 1); T05[0] = _mm_srai_epi16(_mm_load_si128((__m128i*)&coeff[ 0 + 32 * 5]), 1); T06[0] = _mm_srai_epi16(_mm_load_si128((__m128i*)&coeff[ 0 + 32 * 6]), 1); T07[0] = _mm_srai_epi16(_mm_load_si128((__m128i*)&coeff[ 0 + 32 * 7]), 1); T00[1] = _mm_srai_epi16(_mm_load_si128((__m128i*)&coeff[ 8 + 32 * 0]), 1); T01[1] = _mm_srai_epi16(_mm_load_si128((__m128i*)&coeff[ 8 + 32 * 1]), 1); T02[1] = _mm_srai_epi16(_mm_load_si128((__m128i*)&coeff[ 8 + 32 * 2]), 1); T03[1] = _mm_srai_epi16(_mm_load_si128((__m128i*)&coeff[ 8 + 32 * 3]), 1); T04[1] = _mm_srai_epi16(_mm_load_si128((__m128i*)&coeff[ 8 + 32 * 4]), 1); T05[1] = _mm_srai_epi16(_mm_load_si128((__m128i*)&coeff[ 8 + 32 * 5]), 1); T06[1] = _mm_srai_epi16(_mm_load_si128((__m128i*)&coeff[ 8 + 32 * 6]), 1); T07[1] = _mm_srai_epi16(_mm_load_si128((__m128i*)&coeff[ 8 + 32 * 7]), 1); T00[2] = _mm_srai_epi16(_mm_load_si128((__m128i*)&coeff[16 + 32 * 0]), 1); T01[2] = _mm_srai_epi16(_mm_load_si128((__m128i*)&coeff[16 + 32 * 1]), 1); T02[2] = _mm_srai_epi16(_mm_load_si128((__m128i*)&coeff[16 + 32 * 2]), 1); T03[2] = _mm_srai_epi16(_mm_load_si128((__m128i*)&coeff[16 + 32 * 3]), 1); T04[2] = _mm_srai_epi16(_mm_load_si128((__m128i*)&coeff[16 + 32 * 4]), 1); T05[2] = _mm_srai_epi16(_mm_load_si128((__m128i*)&coeff[16 + 32 * 5]), 1); T06[2] = _mm_srai_epi16(_mm_load_si128((__m128i*)&coeff[16 + 32 * 6]), 1); T07[2] = _mm_srai_epi16(_mm_load_si128((__m128i*)&coeff[16 + 32 * 7]), 1); T00[3] = _mm_srai_epi16(_mm_load_si128((__m128i*)&coeff[24 + 32 * 0]), 1); T01[3] = _mm_srai_epi16(_mm_load_si128((__m128i*)&coeff[24 + 32 * 1]), 1); T02[3] = _mm_srai_epi16(_mm_load_si128((__m128i*)&coeff[24 + 32 * 2]), 1); T03[3] = _mm_srai_epi16(_mm_load_si128((__m128i*)&coeff[24 + 32 * 3]), 1); T04[3] = _mm_srai_epi16(_mm_load_si128((__m128i*)&coeff[24 + 32 * 4]), 1); T05[3] = _mm_srai_epi16(_mm_load_si128((__m128i*)&coeff[24 + 32 * 5]), 1); T06[3] = _mm_srai_epi16(_mm_load_si128((__m128i*)&coeff[24 + 32 * 6]), 1); T07[3] = _mm_srai_epi16(_mm_load_si128((__m128i*)&coeff[24 + 32 * 7]), 1); //filter (odd pixel/row) T08[0] = _mm_srai_epi16(_mm_add_epi16(T00[0], T01[0]), 1); T09[0] = 
_mm_srai_epi16(_mm_add_epi16(T01[0], T02[0]), 1); T10[0] = _mm_srai_epi16(_mm_add_epi16(T02[0], T03[0]), 1); T11[0] = _mm_srai_epi16(_mm_add_epi16(T03[0], T04[0]), 1); T12[0] = _mm_srai_epi16(_mm_add_epi16(T04[0], T05[0]), 1); T13[0] = _mm_srai_epi16(_mm_add_epi16(T05[0], T06[0]), 1); T14[0] = _mm_srai_epi16(_mm_add_epi16(T06[0], T07[0]), 1); T15[0] = _mm_srai_epi16(_mm_add_epi16(T07[0], T07[0]), 1); T08[1] = _mm_srai_epi16(_mm_add_epi16(T00[1], T01[1]), 1); T09[1] = _mm_srai_epi16(_mm_add_epi16(T01[1], T02[1]), 1); T10[1] = _mm_srai_epi16(_mm_add_epi16(T02[1], T03[1]), 1); T11[1] = _mm_srai_epi16(_mm_add_epi16(T03[1], T04[1]), 1); T12[1] = _mm_srai_epi16(_mm_add_epi16(T04[1], T05[1]), 1); T13[1] = _mm_srai_epi16(_mm_add_epi16(T05[1], T06[1]), 1); T14[1] = _mm_srai_epi16(_mm_add_epi16(T06[1], T07[1]), 1); T15[1] = _mm_srai_epi16(_mm_add_epi16(T07[1], T07[1]), 1); T08[2] = _mm_srai_epi16(_mm_add_epi16(T00[2], T01[2]), 1); T09[2] = _mm_srai_epi16(_mm_add_epi16(T01[2], T02[2]), 1); T10[2] = _mm_srai_epi16(_mm_add_epi16(T02[2], T03[2]), 1); T11[2] = _mm_srai_epi16(_mm_add_epi16(T03[2], T04[2]), 1); T12[2] = _mm_srai_epi16(_mm_add_epi16(T04[2], T05[2]), 1); T13[2] = _mm_srai_epi16(_mm_add_epi16(T05[2], T06[2]), 1); T14[2] = _mm_srai_epi16(_mm_add_epi16(T06[2], T07[2]), 1); T15[2] = _mm_srai_epi16(_mm_add_epi16(T07[2], T07[2]), 1); T08[3] = _mm_srai_epi16(_mm_add_epi16(T00[3], T01[3]), 1); T09[3] = _mm_srai_epi16(_mm_add_epi16(T01[3], T02[3]), 1); T10[3] = _mm_srai_epi16(_mm_add_epi16(T02[3], T03[3]), 1); T11[3] = _mm_srai_epi16(_mm_add_epi16(T03[3], T04[3]), 1); T12[3] = _mm_srai_epi16(_mm_add_epi16(T04[3], T05[3]), 1); T13[3] = _mm_srai_epi16(_mm_add_epi16(T05[3], T06[3]), 1); T14[3] = _mm_srai_epi16(_mm_add_epi16(T06[3], T07[3]), 1); T15[3] = _mm_srai_epi16(_mm_add_epi16(T07[3], T07[3]), 1); /*--transposition--*/ //32x16 -> 16x32 TRANSPOSE_8x8_16BIT(T00[0], T08[0], T01[0], T09[0], T02[0], T10[0], T03[0], T11[0], V00[0], V01[0], V02[0], V03[0], V04[0], V05[0], V06[0], V07[0]); TRANSPOSE_8x8_16BIT(T00[1], T08[1], T01[1], T09[1], T02[1], T10[1], T03[1], T11[1], V08[0], V09[0], V10[0], V11[0], V12[0], V13[0], V14[0], V15[0]); TRANSPOSE_8x8_16BIT(T00[2], T08[2], T01[2], T09[2], T02[2], T10[2], T03[2], T11[2], V16[0], V17[0], V18[0], V19[0], V20[0], V21[0], V22[0], V23[0]); TRANSPOSE_8x8_16BIT(T00[3], T08[3], T01[3], T09[3], T02[3], T10[3], T03[3], T11[3], V24[0], V25[0], V26[0], V27[0], V28[0], V29[0], V30[0], V31[0]); TRANSPOSE_8x8_16BIT(T04[0], T12[0], T05[0], T13[0], T06[0], T14[0], T07[0], T15[0], V00[1], V01[1], V02[1], V03[1], V04[1], V05[1], V06[1], V07[1]); TRANSPOSE_8x8_16BIT(T04[1], T12[1], T05[1], T13[1], T06[1], T14[1], T07[1], T15[1], V08[1], V09[1], V10[1], V11[1], V12[1], V13[1], V14[1], V15[1]); TRANSPOSE_8x8_16BIT(T04[2], T12[2], T05[2], T13[2], T06[2], T14[2], T07[2], T15[2], V16[1], V17[1], V18[1], V19[1], V20[1], V21[1], V22[1], V23[1]); TRANSPOSE_8x8_16BIT(T04[3], T12[3], T05[3], T13[3], T06[3], T14[3], T07[3], T15[3], V24[1], V25[1], V26[1], V27[1], V28[1], V29[1], V30[1], V31[1]); /*--horizontal transform--*/ //filter (odd pixel/column) V32[0] = _mm_srai_epi16(_mm_add_epi16(V00[0], V01[0]), 1); V33[0] = _mm_srai_epi16(_mm_add_epi16(V01[0], V02[0]), 1); V34[0] = _mm_srai_epi16(_mm_add_epi16(V02[0], V03[0]), 1); V35[0] = _mm_srai_epi16(_mm_add_epi16(V03[0], V04[0]), 1); V36[0] = _mm_srai_epi16(_mm_add_epi16(V04[0], V05[0]), 1); V37[0] = _mm_srai_epi16(_mm_add_epi16(V05[0], V06[0]), 1); V38[0] = _mm_srai_epi16(_mm_add_epi16(V06[0], V07[0]), 1); V39[0] = 
_mm_srai_epi16(_mm_add_epi16(V07[0], V08[0]), 1); V40[0] = _mm_srai_epi16(_mm_add_epi16(V08[0], V09[0]), 1); V41[0] = _mm_srai_epi16(_mm_add_epi16(V09[0], V10[0]), 1); V42[0] = _mm_srai_epi16(_mm_add_epi16(V10[0], V11[0]), 1); V43[0] = _mm_srai_epi16(_mm_add_epi16(V11[0], V12[0]), 1); V44[0] = _mm_srai_epi16(_mm_add_epi16(V12[0], V13[0]), 1); V45[0] = _mm_srai_epi16(_mm_add_epi16(V13[0], V14[0]), 1); V46[0] = _mm_srai_epi16(_mm_add_epi16(V14[0], V15[0]), 1); V47[0] = _mm_srai_epi16(_mm_add_epi16(V15[0], V16[0]), 1); V48[0] = _mm_srai_epi16(_mm_add_epi16(V16[0], V17[0]), 1); V49[0] = _mm_srai_epi16(_mm_add_epi16(V17[0], V18[0]), 1); V50[0] = _mm_srai_epi16(_mm_add_epi16(V18[0], V19[0]), 1); V51[0] = _mm_srai_epi16(_mm_add_epi16(V19[0], V20[0]), 1); V52[0] = _mm_srai_epi16(_mm_add_epi16(V20[0], V21[0]), 1); V53[0] = _mm_srai_epi16(_mm_add_epi16(V21[0], V22[0]), 1); V54[0] = _mm_srai_epi16(_mm_add_epi16(V22[0], V23[0]), 1); V55[0] = _mm_srai_epi16(_mm_add_epi16(V23[0], V24[0]), 1); V56[0] = _mm_srai_epi16(_mm_add_epi16(V24[0], V25[0]), 1); V57[0] = _mm_srai_epi16(_mm_add_epi16(V25[0], V26[0]), 1); V58[0] = _mm_srai_epi16(_mm_add_epi16(V26[0], V27[0]), 1); V59[0] = _mm_srai_epi16(_mm_add_epi16(V27[0], V28[0]), 1); V60[0] = _mm_srai_epi16(_mm_add_epi16(V28[0], V29[0]), 1); V61[0] = _mm_srai_epi16(_mm_add_epi16(V29[0], V30[0]), 1); V62[0] = _mm_srai_epi16(_mm_add_epi16(V30[0], V31[0]), 1); V63[0] = _mm_srai_epi16(_mm_add_epi16(V31[0], V31[0]), 1); V32[1] = _mm_srai_epi16(_mm_add_epi16(V00[1], V01[1]), 1); V33[1] = _mm_srai_epi16(_mm_add_epi16(V01[1], V02[1]), 1); V34[1] = _mm_srai_epi16(_mm_add_epi16(V02[1], V03[1]), 1); V35[1] = _mm_srai_epi16(_mm_add_epi16(V03[1], V04[1]), 1); V36[1] = _mm_srai_epi16(_mm_add_epi16(V04[1], V05[1]), 1); V37[1] = _mm_srai_epi16(_mm_add_epi16(V05[1], V06[1]), 1); V38[1] = _mm_srai_epi16(_mm_add_epi16(V06[1], V07[1]), 1); V39[1] = _mm_srai_epi16(_mm_add_epi16(V07[1], V08[1]), 1); V40[1] = _mm_srai_epi16(_mm_add_epi16(V08[1], V09[1]), 1); V41[1] = _mm_srai_epi16(_mm_add_epi16(V09[1], V10[1]), 1); V42[1] = _mm_srai_epi16(_mm_add_epi16(V10[1], V11[1]), 1); V43[1] = _mm_srai_epi16(_mm_add_epi16(V11[1], V12[1]), 1); V44[1] = _mm_srai_epi16(_mm_add_epi16(V12[1], V13[1]), 1); V45[1] = _mm_srai_epi16(_mm_add_epi16(V13[1], V14[1]), 1); V46[1] = _mm_srai_epi16(_mm_add_epi16(V14[1], V15[1]), 1); V47[1] = _mm_srai_epi16(_mm_add_epi16(V15[1], V16[1]), 1); V48[1] = _mm_srai_epi16(_mm_add_epi16(V16[1], V17[1]), 1); V49[1] = _mm_srai_epi16(_mm_add_epi16(V17[1], V18[1]), 1); V50[1] = _mm_srai_epi16(_mm_add_epi16(V18[1], V19[1]), 1); V51[1] = _mm_srai_epi16(_mm_add_epi16(V19[1], V20[1]), 1); V52[1] = _mm_srai_epi16(_mm_add_epi16(V20[1], V21[1]), 1); V53[1] = _mm_srai_epi16(_mm_add_epi16(V21[1], V22[1]), 1); V54[1] = _mm_srai_epi16(_mm_add_epi16(V22[1], V23[1]), 1); V55[1] = _mm_srai_epi16(_mm_add_epi16(V23[1], V24[1]), 1); V56[1] = _mm_srai_epi16(_mm_add_epi16(V24[1], V25[1]), 1); V57[1] = _mm_srai_epi16(_mm_add_epi16(V25[1], V26[1]), 1); V58[1] = _mm_srai_epi16(_mm_add_epi16(V26[1], V27[1]), 1); V59[1] = _mm_srai_epi16(_mm_add_epi16(V27[1], V28[1]), 1); V60[1] = _mm_srai_epi16(_mm_add_epi16(V28[1], V29[1]), 1); V61[1] = _mm_srai_epi16(_mm_add_epi16(V29[1], V30[1]), 1); V62[1] = _mm_srai_epi16(_mm_add_epi16(V30[1], V31[1]), 1); V63[1] = _mm_srai_epi16(_mm_add_epi16(V31[1], V31[1]), 1); /*--transposition & Store--*/ //16x64 -> 64x16 TRANSPOSE_8x8_16BIT(V00[0], V32[0], V01[0], V33[0], V02[0], V34[0], V03[0], V35[0], T00[0], T01[0], T02[0], T03[0], T04[0], T05[0], T06[0], T07[0]); 
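    /*
     * Note the argument order of these transposes: pairing V00 with V32, V01
     * with V33, ... places each filtered (odd) column register directly after
     * its even partner, so after the 8x8 transposes every output row of the
     * 64x16 block is already in natural left-to-right sample order. The
     * fifteen calls below fill T00..T15 for the remaining column groups in
     * the same way.
     */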
TRANSPOSE_8x8_16BIT(V04[0], V36[0], V05[0], V37[0], V06[0], V38[0], V07[0], V39[0], T00[1], T01[1], T02[1], T03[1], T04[1], T05[1], T06[1], T07[1]); TRANSPOSE_8x8_16BIT(V08[0], V40[0], V09[0], V41[0], V10[0], V42[0], V11[0], V43[0], T00[2], T01[2], T02[2], T03[2], T04[2], T05[2], T06[2], T07[2]); TRANSPOSE_8x8_16BIT(V12[0], V44[0], V13[0], V45[0], V14[0], V46[0], V15[0], V47[0], T00[3], T01[3], T02[3], T03[3], T04[3], T05[3], T06[3], T07[3]); TRANSPOSE_8x8_16BIT(V16[0], V48[0], V17[0], V49[0], V18[0], V50[0], V19[0], V51[0], T00[4], T01[4], T02[4], T03[4], T04[4], T05[4], T06[4], T07[4]); TRANSPOSE_8x8_16BIT(V20[0], V52[0], V21[0], V53[0], V22[0], V54[0], V23[0], V55[0], T00[5], T01[5], T02[5], T03[5], T04[5], T05[5], T06[5], T07[5]); TRANSPOSE_8x8_16BIT(V24[0], V56[0], V25[0], V57[0], V26[0], V58[0], V27[0], V59[0], T00[6], T01[6], T02[6], T03[6], T04[6], T05[6], T06[6], T07[6]); TRANSPOSE_8x8_16BIT(V28[0], V60[0], V29[0], V61[0], V30[0], V62[0], V31[0], V63[0], T00[7], T01[7], T02[7], T03[7], T04[7], T05[7], T06[7], T07[7]); TRANSPOSE_8x8_16BIT(V00[1], V32[1], V01[1], V33[1], V02[1], V34[1], V03[1], V35[1], T08[0], T09[0], T10[0], T11[0], T12[0], T13[0], T14[0], T15[0]); TRANSPOSE_8x8_16BIT(V04[1], V36[1], V05[1], V37[1], V06[1], V38[1], V07[1], V39[1], T08[1], T09[1], T10[1], T11[1], T12[1], T13[1], T14[1], T15[1]); TRANSPOSE_8x8_16BIT(V08[1], V40[1], V09[1], V41[1], V10[1], V42[1], V11[1], V43[1], T08[2], T09[2], T10[2], T11[2], T12[2], T13[2], T14[2], T15[2]); TRANSPOSE_8x8_16BIT(V12[1], V44[1], V13[1], V45[1], V14[1], V46[1], V15[1], V47[1], T08[3], T09[3], T10[3], T11[3], T12[3], T13[3], T14[3], T15[3]); TRANSPOSE_8x8_16BIT(V16[1], V48[1], V17[1], V49[1], V18[1], V50[1], V19[1], V51[1], T08[4], T09[4], T10[4], T11[4], T12[4], T13[4], T14[4], T15[4]); TRANSPOSE_8x8_16BIT(V20[1], V52[1], V21[1], V53[1], V22[1], V54[1], V23[1], V55[1], T08[5], T09[5], T10[5], T11[5], T12[5], T13[5], T14[5], T15[5]); TRANSPOSE_8x8_16BIT(V24[1], V56[1], V25[1], V57[1], V26[1], V58[1], V27[1], V59[1], T08[6], T09[6], T10[6], T11[6], T12[6], T13[6], T14[6], T15[6]); TRANSPOSE_8x8_16BIT(V28[1], V60[1], V29[1], V61[1], V30[1], V62[1], V31[1], V63[1], T08[7], T09[7], T10[7], T11[7], T12[7], T13[7], T14[7], T15[7]); //store for (i = 0; i < 8; i++) { _mm_store_si128((__m128i*)&coeff[8 * i ], T00[i]); _mm_store_si128((__m128i*)&coeff[8 * i + 64 ], T01[i]); _mm_store_si128((__m128i*)&coeff[8 * i + 64 * 2], T02[i]); _mm_store_si128((__m128i*)&coeff[8 * i + 64 * 3], T03[i]); _mm_store_si128((__m128i*)&coeff[8 * i + 64 * 4], T04[i]); _mm_store_si128((__m128i*)&coeff[8 * i + 64 * 5], T05[i]); _mm_store_si128((__m128i*)&coeff[8 * i + 64 * 6], T06[i]); _mm_store_si128((__m128i*)&coeff[8 * i + 64 * 7], T07[i]); _mm_store_si128((__m128i*)&coeff[8 * i + 64 * 8], T08[i]); _mm_store_si128((__m128i*)&coeff[8 * i + 64 * 9], T09[i]); _mm_store_si128((__m128i*)&coeff[8 * i + 64 * 10], T10[i]); _mm_store_si128((__m128i*)&coeff[8 * i + 64 * 11], T11[i]); _mm_store_si128((__m128i*)&coeff[8 * i + 64 * 12], T12[i]); _mm_store_si128((__m128i*)&coeff[8 * i + 64 * 13], T13[i]); _mm_store_si128((__m128i*)&coeff[8 * i + 64 * 14], T14[i]); _mm_store_si128((__m128i*)&coeff[8 * i + 64 * 15], T15[i]); } } /* --------------------------------------------------------------------------- */ void inv_wavelet_16x64_sse128(coeff_t *coeff) { //src coeff 8*32 __m128i S00, S01, S02, S03, S04, S05, S06, S07, S08, S09, S10, S11, S12, S13, S14, S15, S16, S17, S18, S19, S20, S21, S22, S23, S24, S25, S26, S27, S28, S29, S30, S31; __m128i S32, S33, S34, S35, 
S36, S37, S38, S39, S40, S41, S42, S43, S44, S45, S46, S47, S48, S49, S50, S51, S52, S53, S54, S55, S56, S57, S58, S59, S60, S61, S62, S63; // 64*16 __m128i T00[8], T01[8], T02[8], T03[8], T04[8], T05[8], T06[8], T07[8], T08[8], T09[8], T10[8], T11[8], T12[8], T13[8], T14[8], T15[8]; // 16*64 __m128i V00[2], V01[2], V02[2], V03[2], V04[2], V05[2], V06[2], V07[2], V08[2], V09[2], V10[2], V11[2], V12[2], V13[2], V14[2], V15[2], V16[2], V17[2], V18[2], V19[2], V20[2], V21[2], V22[2], V23[2], V24[2], V25[2], V26[2], V27[2], V28[2], V29[2], V30[2], V31[2], V32[2], V33[2], V34[2], V35[2], V36[2], V37[2], V38[2], V39[2], V40[2], V41[2], V42[2], V43[2], V44[2], V45[2], V46[2], V47[2], V48[2], V49[2], V50[2], V51[2], V52[2], V53[2], V54[2], V55[2], V56[2], V57[2], V58[2], V59[2], V60[2], V61[2], V62[2], V63[2]; __m128i tr0_0, tr0_1, tr0_2, tr0_3, tr0_4, tr0_5, tr0_6, tr0_7; __m128i tr1_0, tr1_1, tr1_2, tr1_3, tr1_4, tr1_5, tr1_6, tr1_7; int i; /*--load & shift--*/ //8*32 S00 = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * 0]), 1); S01 = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * 1]), 1); S02 = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * 2]), 1); S03 = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * 3]), 1); S04 = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * 4]), 1); S05 = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * 5]), 1); S06 = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * 6]), 1); S07 = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * 7]), 1); S08 = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * 8]), 1); S09 = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * 9]), 1); S10 = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * 10]), 1); S11 = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * 11]), 1); S12 = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * 12]), 1); S13 = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * 13]), 1); S14 = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * 14]), 1); S15 = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * 15]), 1); S16 = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * 16]), 1); S17 = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * 17]), 1); S18 = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * 18]), 1); S19 = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * 19]), 1); S20 = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * 20]), 1); S21 = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * 21]), 1); S22 = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * 22]), 1); S23 = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * 23]), 1); S24 = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * 24]), 1); S25 = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * 25]), 1); S26 = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * 26]), 1); S27 = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * 27]), 1); S28 = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * 28]), 1); S29 = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * 29]), 1); S30 = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * 30]), 1); S31 = _mm_srai_epi16(_mm_loadu_si128((__m128i*)&coeff[8 * 31]), 1); /*--vertical transform--*/ S32 = _mm_srai_epi16(_mm_add_epi16(S00, S01), 1); S33 = _mm_srai_epi16(_mm_add_epi16(S01, S02), 1); S34 = _mm_srai_epi16(_mm_add_epi16(S02, S03), 1); S35 = _mm_srai_epi16(_mm_add_epi16(S03, S04), 1); S36 = _mm_srai_epi16(_mm_add_epi16(S04, S05), 1); S37 = _mm_srai_epi16(_mm_add_epi16(S05, S06), 1); S38 = _mm_srai_epi16(_mm_add_epi16(S06, S07), 1); S39 = 
_mm_srai_epi16(_mm_add_epi16(S07, S08), 1); S40 = _mm_srai_epi16(_mm_add_epi16(S08, S09), 1); S41 = _mm_srai_epi16(_mm_add_epi16(S09, S10), 1); S42 = _mm_srai_epi16(_mm_add_epi16(S10, S11), 1); S43 = _mm_srai_epi16(_mm_add_epi16(S11, S12), 1); S44 = _mm_srai_epi16(_mm_add_epi16(S12, S13), 1); S45 = _mm_srai_epi16(_mm_add_epi16(S13, S14), 1); S46 = _mm_srai_epi16(_mm_add_epi16(S14, S15), 1); S47 = _mm_srai_epi16(_mm_add_epi16(S15, S16), 1); S48 = _mm_srai_epi16(_mm_add_epi16(S16, S17), 1); S49 = _mm_srai_epi16(_mm_add_epi16(S17, S18), 1); S50 = _mm_srai_epi16(_mm_add_epi16(S18, S19), 1); S51 = _mm_srai_epi16(_mm_add_epi16(S19, S20), 1); S52 = _mm_srai_epi16(_mm_add_epi16(S20, S21), 1); S53 = _mm_srai_epi16(_mm_add_epi16(S21, S22), 1); S54 = _mm_srai_epi16(_mm_add_epi16(S22, S23), 1); S55 = _mm_srai_epi16(_mm_add_epi16(S23, S24), 1); S56 = _mm_srai_epi16(_mm_add_epi16(S24, S25), 1); S57 = _mm_srai_epi16(_mm_add_epi16(S25, S26), 1); S58 = _mm_srai_epi16(_mm_add_epi16(S26, S27), 1); S59 = _mm_srai_epi16(_mm_add_epi16(S27, S28), 1); S60 = _mm_srai_epi16(_mm_add_epi16(S28, S29), 1); S61 = _mm_srai_epi16(_mm_add_epi16(S29, S30), 1); S62 = _mm_srai_epi16(_mm_add_epi16(S30, S31), 1); S63 = _mm_srai_epi16(_mm_add_epi16(S31, S31), 1); /*--transposition--*/ //8x64 -> 64x8 TRANSPOSE_8x8_16BIT(S00, S32, S01, S33, S02, S34, S03, S35, T00[0], T01[0], T02[0], T03[0], T04[0], T05[0], T06[0], T07[0]); TRANSPOSE_8x8_16BIT(S04, S36, S05, S37, S06, S38, S07, S39, T00[1], T01[1], T02[1], T03[1], T04[1], T05[1], T06[1], T07[1]); TRANSPOSE_8x8_16BIT(S08, S40, S09, S41, S10, S42, S11, S43, T00[2], T01[2], T02[2], T03[2], T04[2], T05[2], T06[2], T07[2]); TRANSPOSE_8x8_16BIT(S12, S44, S13, S45, S14, S46, S15, S47, T00[3], T01[3], T02[3], T03[3], T04[3], T05[3], T06[3], T07[3]); TRANSPOSE_8x8_16BIT(S16, S48, S17, S49, S18, S50, S19, S51, T00[4], T01[4], T02[4], T03[4], T04[4], T05[4], T06[4], T07[4]); TRANSPOSE_8x8_16BIT(S20, S52, S21, S53, S22, S54, S23, S55, T00[5], T01[5], T02[5], T03[5], T04[5], T05[5], T06[5], T07[5]); TRANSPOSE_8x8_16BIT(S24, S56, S25, S57, S26, S58, S27, S59, T00[6], T01[6], T02[6], T03[6], T04[6], T05[6], T06[6], T07[6]); TRANSPOSE_8x8_16BIT(S28, S60, S29, S61, S30, S62, S31, S63, T00[7], T01[7], T02[7], T03[7], T04[7], T05[7], T06[7], T07[7]); /*--horizontal transform--*/ for (i = 0; i < 8; i++) { T08[i] = _mm_srai_epi16(_mm_add_epi16(T00[i], T01[i]), 1); T09[i] = _mm_srai_epi16(_mm_add_epi16(T01[i], T02[i]), 1); T10[i] = _mm_srai_epi16(_mm_add_epi16(T02[i], T03[i]), 1); T11[i] = _mm_srai_epi16(_mm_add_epi16(T03[i], T04[i]), 1); T12[i] = _mm_srai_epi16(_mm_add_epi16(T04[i], T05[i]), 1); T13[i] = _mm_srai_epi16(_mm_add_epi16(T05[i], T06[i]), 1); T14[i] = _mm_srai_epi16(_mm_add_epi16(T06[i], T07[i]), 1); T15[i] = _mm_srai_epi16(_mm_add_epi16(T07[i], T07[i]), 1); } /*--transposition--*/ //64x16 -> 16x64 TRANSPOSE_8x8_16BIT(T00[0], T08[0], T01[0], T09[0], T02[0], T10[0], T03[0], T11[0], V00[0], V01[0], V02[0], V03[0], V04[0], V05[0], V06[0], V07[0]); TRANSPOSE_8x8_16BIT(T00[1], T08[1], T01[1], T09[1], T02[1], T10[1], T03[1], T11[1], V08[0], V09[0], V10[0], V11[0], V12[0], V13[0], V14[0], V15[0]); TRANSPOSE_8x8_16BIT(T00[2], T08[2], T01[2], T09[2], T02[2], T10[2], T03[2], T11[2], V16[0], V17[0], V18[0], V19[0], V20[0], V21[0], V22[0], V23[0]); TRANSPOSE_8x8_16BIT(T00[3], T08[3], T01[3], T09[3], T02[3], T10[3], T03[3], T11[3], V24[0], V25[0], V26[0], V27[0], V28[0], V29[0], V30[0], V31[0]); TRANSPOSE_8x8_16BIT(T00[4], T08[4], T01[4], T09[4], T02[4], T10[4], T03[4], T11[4], V32[0], V33[0], V34[0], 
V35[0], V36[0], V37[0], V38[0], V39[0]); TRANSPOSE_8x8_16BIT(T00[5], T08[5], T01[5], T09[5], T02[5], T10[5], T03[5], T11[5], V40[0], V41[0], V42[0], V43[0], V44[0], V45[0], V46[0], V47[0]); TRANSPOSE_8x8_16BIT(T00[6], T08[6], T01[6], T09[6], T02[6], T10[6], T03[6], T11[6], V48[0], V49[0], V50[0], V51[0], V52[0], V53[0], V54[0], V55[0]); TRANSPOSE_8x8_16BIT(T00[7], T08[7], T01[7], T09[7], T02[7], T10[7], T03[7], T11[7], V56[0], V57[0], V58[0], V59[0], V60[0], V61[0], V62[0], V63[0]); TRANSPOSE_8x8_16BIT(T04[0], T12[0], T05[0], T13[0], T06[0], T14[0], T07[0], T15[0], V00[1], V01[1], V02[1], V03[1], V04[1], V05[1], V06[1], V07[1]); TRANSPOSE_8x8_16BIT(T04[1], T12[1], T05[1], T13[1], T06[1], T14[1], T07[1], T15[1], V08[1], V09[1], V10[1], V11[1], V12[1], V13[1], V14[1], V15[1]); TRANSPOSE_8x8_16BIT(T04[2], T12[2], T05[2], T13[2], T06[2], T14[2], T07[2], T15[2], V16[1], V17[1], V18[1], V19[1], V20[1], V21[1], V22[1], V23[1]); TRANSPOSE_8x8_16BIT(T04[3], T12[3], T05[3], T13[3], T06[3], T14[3], T07[3], T15[3], V24[1], V25[1], V26[1], V27[1], V28[1], V29[1], V30[1], V31[1]); TRANSPOSE_8x8_16BIT(T04[4], T12[4], T05[4], T13[4], T06[4], T14[4], T07[4], T15[4], V32[1], V33[1], V34[1], V35[1], V36[1], V37[1], V38[1], V39[1]); TRANSPOSE_8x8_16BIT(T04[5], T12[5], T05[5], T13[5], T06[5], T14[5], T07[5], T15[5], V40[1], V41[1], V42[1], V43[1], V44[1], V45[1], V46[1], V47[1]); TRANSPOSE_8x8_16BIT(T04[6], T12[6], T05[6], T13[6], T06[6], T14[6], T07[6], T15[6], V48[1], V49[1], V50[1], V51[1], V52[1], V53[1], V54[1], V55[1]); TRANSPOSE_8x8_16BIT(T04[7], T12[7], T05[7], T13[7], T06[7], T14[7], T07[7], T15[7], V56[1], V57[1], V58[1], V59[1], V60[1], V61[1], V62[1], V63[1]); /*--Store--*/ //16x64 for (i = 0; i < 2; i++) { _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 0], V00[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 1], V01[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 2], V02[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 3], V03[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 4], V04[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 5], V05[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 6], V06[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 7], V07[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 8], V08[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 9], V09[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 10], V10[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 11], V11[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 12], V12[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 13], V13[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 14], V14[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 15], V15[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 16], V16[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 17], V17[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 18], V18[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 19], V19[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 20], V20[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 21], V21[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 22], V22[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 23], V23[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 24], V24[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 25], V25[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 26], V26[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 27], V27[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 28], V28[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 29], V29[i]); 
_mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 30], V30[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 31], V31[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 32], V32[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 33], V33[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 34], V34[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 35], V35[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 36], V36[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 37], V37[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 38], V38[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 39], V39[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 40], V40[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 41], V41[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 42], V42[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 43], V43[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 44], V44[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 45], V45[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 46], V46[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 47], V47[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 48], V48[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 49], V49[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 50], V50[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 51], V51[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 52], V52[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 53], V53[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 54], V54[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 55], V55[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 56], V56[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 57], V57[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 58], V58[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 59], V59[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 60], V60[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 61], V61[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 62], V62[i]); _mm_storeu_si128((__m128i*)&coeff[8 * i + 16 * 63], V63[i]); } } /* --------------------------------------------------------------------------- */ void idct_64x64_sse128(const coeff_t *src, coeff_t *dst, int i_dst) { UNUSED_PARAMETER(i_dst); idct_32x32_sse128(src, dst, 32 | 0x01); /* 32x32 idct */ inv_wavelet_64x64_sse128(dst); } /* --------------------------------------------------------------------------- */ void idct_64x64_half_sse128(const coeff_t *src, coeff_t *dst, int i_dst) { UNUSED_PARAMETER(i_dst); idct_32x32_half_sse128(src, dst, 32 | 0x01); /* 32x32 idct */ inv_wavelet_64x64_sse128(dst); } /* --------------------------------------------------------------------------- */ void idct_64x64_quad_sse128(const coeff_t *src, coeff_t *dst, int i_dst) { UNUSED_PARAMETER(i_dst); idct_32x32_quad_sse128(src, dst, 32 | 0x01); /* 32x32 idct */ inv_wavelet_64x64_sse128(dst); } /* --------------------------------------------------------------------------- */ void idct_64x16_sse128(const coeff_t *src, coeff_t *dst, int i_dst) { UNUSED_PARAMETER(i_dst); idct_32x8_sse128(src, dst, 32 | 0x01); inv_wavelet_64x16_sse128(dst); } /* --------------------------------------------------------------------------- */ void idct_64x16_half_sse128(const coeff_t *src, coeff_t *dst, int i_dst) { UNUSED_PARAMETER(i_dst); idct_32x8_half_sse128(src, dst, 32 | 0x01); inv_wavelet_64x16_sse128(dst); } /* --------------------------------------------------------------------------- */ void idct_64x16_quad_sse128(const coeff_t *src, coeff_t *dst, int 
i_dst) { UNUSED_PARAMETER(i_dst); idct_32x8_quad_sse128(src, dst, 32 | 0x01); inv_wavelet_64x16_sse128(dst); } /* --------------------------------------------------------------------------- */ void idct_16x64_sse128(const coeff_t *src, coeff_t *dst, int i_dst) { UNUSED_PARAMETER(i_dst); idct_8x32_sse128(src, dst, 8 | 0x01); inv_wavelet_16x64_sse128(dst); } /* --------------------------------------------------------------------------- */ void idct_16x64_half_sse128(const coeff_t *src, coeff_t *dst, int i_dst) { UNUSED_PARAMETER(i_dst); idct_8x32_half_sse128(src, dst, 8 | 0x01); inv_wavelet_16x64_sse128(dst); } /* --------------------------------------------------------------------------- */ void idct_16x64_quad_sse128(const coeff_t *src, coeff_t *dst, int i_dst) { UNUSED_PARAMETER(i_dst); idct_8x32_quad_sse128(src, dst, 8 | 0x01); inv_wavelet_16x64_sse128(dst); } davs2-1.6/source/common/vec/intrinsic_idct_avx2.cc000066400000000000000000004414411337322544400222370ustar00rootroot00000000000000/* * intrinsic_idct_avx2.cc * * Description of this file: * AVX2 assembly functions of IDCT module of the davs2 library * * -------------------------------------------------------------------------- * * davs2 - video decoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. 
 */

#include "../common.h"
#include "intrinsic.h"

#include <mmintrin.h>
#include <emmintrin.h>
#include <tmmintrin.h>
#include <smmintrin.h>
#include <immintrin.h>

/* disable warnings */
#pragma warning(disable:4127)  // warning C4127: conditional expression is constant

ALIGN32(static const coeff_t tab_idct_8x8_256[12][16]) = {
    {  44,  38,  44,  38,  44,  38,  44,  38,  44,  38,  44,  38,  44,  38,  44,  38 },
    {  25,   9,  25,   9,  25,   9,  25,   9,  25,   9,  25,   9,  25,   9,  25,   9 },
    {  38,  -9,  38,  -9,  38,  -9,  38,  -9,  38,  -9,  38,  -9,  38,  -9,  38,  -9 },
    { -44, -25, -44, -25, -44, -25, -44, -25, -44, -25, -44, -25, -44, -25, -44, -25 },
    {  25, -44,  25, -44,  25, -44,  25, -44,  25, -44,  25, -44,  25, -44,  25, -44 },
    {   9,  38,   9,  38,   9,  38,   9,  38,   9,  38,   9,  38,   9,  38,   9,  38 },
    {   9, -25,   9, -25,   9, -25,   9, -25,   9, -25,   9, -25,   9, -25,   9, -25 },
    {  38, -44,  38, -44,  38, -44,  38, -44,  38, -44,  38, -44,  38, -44,  38, -44 },
    {  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32,  32 },
    {  32, -32,  32, -32,  32, -32,  32, -32,  32, -32,  32, -32,  32, -32,  32, -32 },
    {  42,  17,  42,  17,  42,  17,  42,  17,  42,  17,  42,  17,  42,  17,  42,  17 },
    {  17, -42,  17, -42,  17, -42,  17, -42,  17, -42,  17, -42,  17, -42,  17, -42 }
};

void idct_8x8_avx2(const coeff_t *src, coeff_t *dst, int i_dst)
{
    const int SHIFT1 = 5;
    // const int CLIP1 = LIMIT_BIT;
    const int SHIFT2 = 20 - g_bit_depth;
    const int CLIP2  = g_bit_depth + 1;
    __m256i mAdd;
    __m256i S1S5, S3S7;
    __m256i T0, T1, T2, T3;
    __m256i E0, E1, E2, E3, O0, O1, O2, O3;
    __m256i EE0, EE1, EO0, EO1;
    __m256i S0, S1, S2, S3, S4, S5, S6, S7;
    __m256i C00, C01, C02, C03, C04, C05, C06, C07;
    __m256i max_val, min_val;

    UNUSED_PARAMETER(i_dst);

    S1S5 = _mm256_loadu2_m128i((__m128i*)&src[40], (__m128i*)&src[ 8]);
    S3S7 = _mm256_loadu2_m128i((__m128i*)&src[56], (__m128i*)&src[24]);
    T0 = _mm256_unpacklo_epi16(S1S5, S3S7);
    T1 = _mm256_unpackhi_epi16(S1S5, S3S7);
    T2 = _mm256_permute2x128_si256(T0, T1, 0x20);
    T3 = _mm256_permute2x128_si256(T0, T1, 0x31);

    O0 = _mm256_add_epi32(_mm256_madd_epi16(T2, _mm256_load_si256((__m256i*)(tab_idct_8x8_256[0]))),
                          _mm256_madd_epi16(T3, _mm256_load_si256((__m256i*)(tab_idct_8x8_256[1]))));
    O1 = _mm256_add_epi32(_mm256_madd_epi16(T2, _mm256_load_si256((__m256i*)(tab_idct_8x8_256[2]))),
                          _mm256_madd_epi16(T3, _mm256_load_si256((__m256i*)(tab_idct_8x8_256[3]))));
    O2 = _mm256_add_epi32(_mm256_madd_epi16(T2, _mm256_load_si256((__m256i*)(tab_idct_8x8_256[4]))),
                          _mm256_madd_epi16(T3, _mm256_load_si256((__m256i*)(tab_idct_8x8_256[5]))));
    O3 = _mm256_add_epi32(_mm256_madd_epi16(T2, _mm256_load_si256((__m256i*)(tab_idct_8x8_256[6]))),
                          _mm256_madd_epi16(T3, _mm256_load_si256((__m256i*)(tab_idct_8x8_256[7]))));

    /* ------- */
    S1S5 = _mm256_loadu2_m128i((__m128i*)&src[16], (__m128i*)&src[0]);
    S3S7 = _mm256_loadu2_m128i((__m128i*)&src[48], (__m128i*)&src[32]);
    T0 = _mm256_unpacklo_epi16(S1S5, S3S7);
    T1 = _mm256_unpackhi_epi16(S1S5, S3S7);
    T2 = _mm256_permute2x128_si256(T0, T1, 0x20);
    T3 = _mm256_permute2x128_si256(T0, T1, 0x31);

    EE0 = _mm256_madd_epi16(T2, _mm256_load_si256((__m256i*)(tab_idct_8x8_256[8])));
    EE1 = _mm256_madd_epi16(T2, _mm256_load_si256((__m256i*)(tab_idct_8x8_256[9])));
    EO0 = _mm256_madd_epi16(T3, _mm256_load_si256((__m256i*)(tab_idct_8x8_256[10])));
    EO1 = _mm256_madd_epi16(T3, _mm256_load_si256((__m256i*)(tab_idct_8x8_256[11])));

    /* ------- */
    mAdd = _mm256_set1_epi32((1 << (SHIFT1 - 1)));  // rounding offset of the first pass

    E0 = _mm256_add_epi32(EE0, EO0);
    E1 = _mm256_add_epi32(EE1, EO1);
    E3 = _mm256_sub_epi32(EE0, EO0);
    E2 = _mm256_sub_epi32(EE1, EO1);
    E0 = _mm256_add_epi32(E0, mAdd);
    E1 = _mm256_add_epi32(E1, mAdd);
    E2 = _mm256_add_epi32(E2, mAdd);
    E3 = _mm256_add_epi32(E3, mAdd);

    S0 = _mm256_srai_epi32(_mm256_add_epi32(E0, O0), SHIFT1);
    S7 = _mm256_srai_epi32(_mm256_sub_epi32(E0, O0), SHIFT1);
    S1 = _mm256_srai_epi32(_mm256_add_epi32(E1, O1), SHIFT1);
    S6 = _mm256_srai_epi32(_mm256_sub_epi32(E1, O1), SHIFT1);
    S2 = _mm256_srai_epi32(_mm256_add_epi32(E2, O2), SHIFT1);
    S5 = _mm256_srai_epi32(_mm256_sub_epi32(E2, O2), SHIFT1);
    S3 = _mm256_srai_epi32(_mm256_add_epi32(E3, O3), SHIFT1);
    S4 = _mm256_srai_epi32(_mm256_sub_epi32(E3, O3), SHIFT1);

    C00 = _mm256_permute2x128_si256(S0, S4, 0x20);
    C01 = _mm256_permute2x128_si256(S0, S4, 0x31);
    C02 = _mm256_permute2x128_si256(S1, S5, 0x20);
    C03 = _mm256_permute2x128_si256(S1, S5, 0x31);
    C04 = _mm256_permute2x128_si256(S2, S6, 0x20);
    C05 = _mm256_permute2x128_si256(S2, S6, 0x31);
    C06 = _mm256_permute2x128_si256(S3, S7, 0x20);
    C07 = _mm256_permute2x128_si256(S3, S7, 0x31);

    S0 = _mm256_packs_epi32(C00, C01);
    S1 = _mm256_packs_epi32(C02, C03);
    S2 = _mm256_packs_epi32(C04, C05);
    S3 = _mm256_packs_epi32(C06, C07);

    S4 = _mm256_unpacklo_epi16(S0, S1);
    S5 = _mm256_unpacklo_epi16(S2, S3);
    S6 = _mm256_unpackhi_epi16(S0, S1);
    S7 = _mm256_unpackhi_epi16(S2, S3);

    C00 = _mm256_unpacklo_epi32(S4, S5);
    C01 = _mm256_unpacklo_epi32(S6, S7);
    C02 = _mm256_unpackhi_epi32(S4, S5);
    C03 = _mm256_unpackhi_epi32(S6, S7);

    C04 = _mm256_permute2x128_si256(C00, C02, 0x20);
    C05 = _mm256_permute2x128_si256(C00, C02, 0x31);
    C06 = _mm256_permute2x128_si256(C01, C03, 0x20);
    C07 = _mm256_permute2x128_si256(C01, C03, 0x31);

    S0 = _mm256_unpacklo_epi64(C04, C05);
    S1 = _mm256_unpacklo_epi64(C06, C07);
    S2 = _mm256_unpackhi_epi64(C04, C05);
    S3 = _mm256_unpackhi_epi64(C06, C07);

    S4 = _mm256_permute2x128_si256(S2, S3, 0x20);
    S5 = _mm256_permute2x128_si256(S2, S3, 0x31);

    T0 = _mm256_unpacklo_epi16(S4, S5);
    T1 = _mm256_unpackhi_epi16(S4, S5);
    T2 = _mm256_permute2x128_si256(T0, T1, 0x20);
    T3 = _mm256_permute2x128_si256(T0, T1, 0x31);

    O0 = _mm256_add_epi32(_mm256_madd_epi16(T2, _mm256_load_si256((__m256i*)(tab_idct_8x8_256[0]))),
                          _mm256_madd_epi16(T3, _mm256_load_si256((__m256i*)(tab_idct_8x8_256[1]))));
    O1 = _mm256_add_epi32(_mm256_madd_epi16(T2, _mm256_load_si256((__m256i*)(tab_idct_8x8_256[2]))),
                          _mm256_madd_epi16(T3, _mm256_load_si256((__m256i*)(tab_idct_8x8_256[3]))));
    O2 = _mm256_add_epi32(_mm256_madd_epi16(T2, _mm256_load_si256((__m256i*)(tab_idct_8x8_256[4]))),
                          _mm256_madd_epi16(T3, _mm256_load_si256((__m256i*)(tab_idct_8x8_256[5]))));
    O3 = _mm256_add_epi32(_mm256_madd_epi16(T2, _mm256_load_si256((__m256i*)(tab_idct_8x8_256[6]))),
                          _mm256_madd_epi16(T3, _mm256_load_si256((__m256i*)(tab_idct_8x8_256[7]))));

    /* ------- */
    T0 = _mm256_unpacklo_epi16(S0, S1);
    T1 = _mm256_unpackhi_epi16(S0, S1);
    T2 = _mm256_permute2x128_si256(T0, T1, 0x20);
    T3 = _mm256_permute2x128_si256(T0, T1, 0x31);

    EE0 = _mm256_madd_epi16(T2, _mm256_load_si256((__m256i*)(tab_idct_8x8_256[8])));
    EE1 = _mm256_madd_epi16(T2, _mm256_load_si256((__m256i*)(tab_idct_8x8_256[9])));
    EO0 = _mm256_madd_epi16(T3, _mm256_load_si256((__m256i*)(tab_idct_8x8_256[10])));
    EO1 = _mm256_madd_epi16(T3, _mm256_load_si256((__m256i*)(tab_idct_8x8_256[11])));

    /* ------- */
(1 << (SHIFT2 - 1)) : 0); // rounding offset for the second pass E0 = _mm256_add_epi32(EE0, EO0); E1 = _mm256_add_epi32(EE1, EO1); E3 = _mm256_sub_epi32(EE0, EO0); E2 = _mm256_sub_epi32(EE1, EO1); E0 = _mm256_add_epi32(E0, mAdd); E1 = _mm256_add_epi32(E1, mAdd); E2 = _mm256_add_epi32(E2, mAdd); E3 = _mm256_add_epi32(E3, mAdd); S0 = _mm256_srai_epi32(_mm256_add_epi32(E0, O0), SHIFT2); S7 = _mm256_srai_epi32(_mm256_sub_epi32(E0, O0), SHIFT2); S1 = _mm256_srai_epi32(_mm256_add_epi32(E1, O1), SHIFT2); S6 = _mm256_srai_epi32(_mm256_sub_epi32(E1, O1), SHIFT2); S2 = _mm256_srai_epi32(_mm256_add_epi32(E2, O2), SHIFT2); S5 = _mm256_srai_epi32(_mm256_sub_epi32(E2, O2), SHIFT2); S3 = _mm256_srai_epi32(_mm256_add_epi32(E3, O3), SHIFT2); S4 = _mm256_srai_epi32(_mm256_sub_epi32(E3, O3), SHIFT2); C00 = _mm256_permute2x128_si256(S0, S4, 0x20); C01 = _mm256_permute2x128_si256(S0, S4, 0x31); C02 = _mm256_permute2x128_si256(S1, S5, 0x20); C03 = _mm256_permute2x128_si256(S1, S5, 0x31); C04 = _mm256_permute2x128_si256(S2, S6, 0x20); C05 = _mm256_permute2x128_si256(S2, S6, 0x31); C06 = _mm256_permute2x128_si256(S3, S7, 0x20); C07 = _mm256_permute2x128_si256(S3, S7, 0x31); S0 = _mm256_packs_epi32(C00, C01); S1 = _mm256_packs_epi32(C02, C03); S2 = _mm256_packs_epi32(C04, C05); S3 = _mm256_packs_epi32(C06, C07); S4 = _mm256_unpacklo_epi16(S0, S1); S5 = _mm256_unpacklo_epi16(S2, S3); S6 = _mm256_unpackhi_epi16(S0, S1); S7 = _mm256_unpackhi_epi16(S2, S3); C00 = _mm256_unpacklo_epi32(S4, S5); C01 = _mm256_unpacklo_epi32(S6, S7); C02 = _mm256_unpackhi_epi32(S4, S5); C03 = _mm256_unpackhi_epi32(S6, S7); C04 = _mm256_permute2x128_si256(C00, C02, 0x20); C05 = _mm256_permute2x128_si256(C00, C02, 0x31); C06 = _mm256_permute2x128_si256(C01, C03, 0x20); C07 = _mm256_permute2x128_si256(C01, C03, 0x31); S0 = _mm256_unpacklo_epi64(C04, C05); S1 = _mm256_unpacklo_epi64(C06, C07); S2 = _mm256_unpackhi_epi64(C04, C05); S3 = _mm256_unpackhi_epi64(C06, C07); // CLIP2 max_val = _mm256_set1_epi16((1 << (CLIP2 - 1)) - 1); min_val = _mm256_set1_epi16(-(1 << (CLIP2 - 1))); S0 = _mm256_max_epi16(_mm256_min_epi16(S0, max_val), min_val); S1 = _mm256_max_epi16(_mm256_min_epi16(S1, max_val), min_val); S2 = _mm256_max_epi16(_mm256_min_epi16(S2, max_val), min_val); S3 = _mm256_max_epi16(_mm256_min_epi16(S3, max_val), min_val); // store _mm256_storeu2_m128i((__m128i*)&dst[16], (__m128i*)&dst[ 0], S0); _mm256_storeu2_m128i((__m128i*)&dst[48], (__m128i*)&dst[32], S1); _mm256_storeu2_m128i((__m128i*)&dst[24], (__m128i*)&dst[ 8], S2); _mm256_storeu2_m128i((__m128i*)&dst[56], (__m128i*)&dst[40], S3); } void idct_16x16_avx2(const coeff_t *src, coeff_t *dst, int i_dst) { const int shift = 20 - g_bit_depth; const int clip = g_bit_depth + 1; const __m256i c16_p43_p45 = _mm256_set1_epi32(0x002B002D); //row0 (43 in high halfword, 45 in low) const __m256i c16_p35_p40 = _mm256_set1_epi32(0x00230028); const __m256i c16_p21_p29 = _mm256_set1_epi32(0x0015001D); const __m256i c16_p04_p13 = _mm256_set1_epi32(0x0004000D); const __m256i c16_p29_p43 = _mm256_set1_epi32(0x001D002B); //row1 const __m256i c16_n21_p04 = _mm256_set1_epi32(0xFFEB0004); const __m256i c16_n45_n40 = _mm256_set1_epi32(0xFFD3FFD8); const __m256i c16_n13_n35 = _mm256_set1_epi32(0xFFF3FFDD); const __m256i c16_p04_p40 = _mm256_set1_epi32(0x00040028); //row2 const __m256i c16_n43_n35 = _mm256_set1_epi32(0xFFD5FFDD); const __m256i c16_p29_n13 = _mm256_set1_epi32(0x001DFFF3); const __m256i c16_p21_p45 = _mm256_set1_epi32(0x0015002D); const __m256i c16_n21_p35 = _mm256_set1_epi32(0xFFEB0023); //row3 const __m256i c16_p04_n43 =
_mm256_set1_epi32(0x0004FFD5); const __m256i c16_p13_p45 = _mm256_set1_epi32(0x000D002D); const __m256i c16_n29_n40 = _mm256_set1_epi32(0xFFE3FFD8); const __m256i c16_n40_p29 = _mm256_set1_epi32(0xFFD8001D); //row4 const __m256i c16_p45_n13 = _mm256_set1_epi32(0x002DFFF3); const __m256i c16_n43_n04 = _mm256_set1_epi32(0xFFD5FFFC); const __m256i c16_p35_p21 = _mm256_set1_epi32(0x00230015); const __m256i c16_n45_p21 = _mm256_set1_epi32(0xFFD30015); //row5 const __m256i c16_p13_p29 = _mm256_set1_epi32(0x000D001D); const __m256i c16_p35_n43 = _mm256_set1_epi32(0x0023FFD5); const __m256i c16_n40_p04 = _mm256_set1_epi32(0xFFD80004); const __m256i c16_n35_p13 = _mm256_set1_epi32(0xFFDD000D); //row6 const __m256i c16_n40_p45 = _mm256_set1_epi32(0xFFD8002D); const __m256i c16_p04_p21 = _mm256_set1_epi32(0x00040015); const __m256i c16_p43_n29 = _mm256_set1_epi32(0x002BFFE3); const __m256i c16_n13_p04 = _mm256_set1_epi32(0xFFF30004); //row7 const __m256i c16_n29_p21 = _mm256_set1_epi32(0xFFE30015); const __m256i c16_n40_p35 = _mm256_set1_epi32(0xFFD80023); const __m256i c16_n45_p43 = _mm256_set1_epi32(0xFFD3002B); const __m256i c16_p38_p44 = _mm256_set1_epi32(0x0026002C); const __m256i c16_p09_p25 = _mm256_set1_epi32(0x00090019); const __m256i c16_n09_p38 = _mm256_set1_epi32(0xFFF70026); const __m256i c16_n25_n44 = _mm256_set1_epi32(0xFFE7FFD4); const __m256i c16_n44_p25 = _mm256_set1_epi32(0xFFD40019); const __m256i c16_p38_p09 = _mm256_set1_epi32(0x00260009); const __m256i c16_n25_p09 = _mm256_set1_epi32(0xFFE70009); const __m256i c16_n44_p38 = _mm256_set1_epi32(0xFFD40026); const __m256i c16_p17_p42 = _mm256_set1_epi32(0x0011002A); const __m256i c16_n42_p17 = _mm256_set1_epi32(0xFFD60011); const __m256i c16_n32_p32 = _mm256_set1_epi32(0xFFE00020); const __m256i c16_p32_p32 = _mm256_set1_epi32(0x00200020); __m256i max_val, min_val; __m256i c32_rnd = _mm256_set1_epi32(16); // rounding offset for the first pass (1 << (5 - 1)) int nShift = 5; int pass; __m256i in00, in01, in02, in03, in04, in05, in06, in07; __m256i in08, in09, in10, in11, in12, in13, in14, in15; __m256i res00, res01, res02, res03, res04, res05, res06, res07; __m256i res08, res09, res10, res11, res12, res13, res14, res15; UNUSED_PARAMETER(i_dst); in00 = _mm256_lddqu_si256((const __m256i*)&src[0 * 16]); // [07 06 05 04 03 02 01 00] in01 = _mm256_lddqu_si256((const __m256i*)&src[1 * 16]); // [17 16 15 14 13 12 11 10] in02 = _mm256_lddqu_si256((const __m256i*)&src[2 * 16]); // [27 26 25 24 23 22 21 20] in03 = _mm256_lddqu_si256((const __m256i*)&src[3 * 16]); // [37 36 35 34 33 32 31 30] in04 = _mm256_lddqu_si256((const __m256i*)&src[4 * 16]); // [47 46 45 44 43 42 41 40] in05 = _mm256_lddqu_si256((const __m256i*)&src[5 * 16]); // [57 56 55 54 53 52 51 50] in06 = _mm256_lddqu_si256((const __m256i*)&src[6 * 16]); // [67 66 65 64 63 62 61 60] in07 = _mm256_lddqu_si256((const __m256i*)&src[7 * 16]); // [77 76 75 74 73 72 71 70] in08 = _mm256_lddqu_si256((const __m256i*)&src[8 * 16]); in09 = _mm256_lddqu_si256((const __m256i*)&src[9 * 16]); in10 = _mm256_lddqu_si256((const __m256i*)&src[10 * 16]); in11 = _mm256_lddqu_si256((const __m256i*)&src[11 * 16]); in12 = _mm256_lddqu_si256((const __m256i*)&src[12 * 16]); in13 = _mm256_lddqu_si256((const __m256i*)&src[13 * 16]); in14 = _mm256_lddqu_si256((const __m256i*)&src[14 * 16]); in15 = _mm256_lddqu_si256((const __m256i*)&src[15 * 16]); for (pass = 0; pass < 2; pass++) { const __m256i T_00_00A = _mm256_unpacklo_epi16(in01, in03); // [33 13 32 12 31 11 30 10] const __m256i T_00_00B = _mm256_unpackhi_epi16(in01, in03); // [37 17 36 16 35 15 34 14]
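/* Annotation (added for readability; not part of the original davs2 source): each pass of this loop is the standard even/odd butterfly factorization of the 16-point AVS2 inverse transform. The unpacklo/unpackhi pairs interleave two coefficient rows so that every 32-bit lane holds one 16-bit sample from each row; a single _mm256_madd_epi16() with a packed constant such as c16_p43_p45 (43 in the high halfword, 45 in the low) then evaluates row1*45 + row3*43 per lane, i.e. two multiplies plus the butterfly add in one instruction. In scalar form, with E[k] accumulated from the even input rows and O[k] from the odd rows, each pass computes out[k] = (E[k] + O[k] + rnd) >> nShift and out[15 - k] = (E[k] - O[k] + rnd) >> nShift, where rnd = 1 << (nShift - 1); the results are narrowed by the packs/min/max steps further below. */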
const __m256i T_00_01A = _mm256_unpacklo_epi16(in05, in07); // [ ] const __m256i T_00_01B = _mm256_unpackhi_epi16(in05, in07); // [ ] const __m256i T_00_02A = _mm256_unpacklo_epi16(in09, in11); // [ ] const __m256i T_00_02B = _mm256_unpackhi_epi16(in09, in11); // [ ] const __m256i T_00_03A = _mm256_unpacklo_epi16(in13, in15); // [ ] const __m256i T_00_03B = _mm256_unpackhi_epi16(in13, in15); // [ ] const __m256i T_00_04A = _mm256_unpacklo_epi16(in02, in06); // [ ] const __m256i T_00_04B = _mm256_unpackhi_epi16(in02, in06); // [ ] const __m256i T_00_05A = _mm256_unpacklo_epi16(in10, in14); // [ ] const __m256i T_00_05B = _mm256_unpackhi_epi16(in10, in14); // [ ] const __m256i T_00_06A = _mm256_unpacklo_epi16(in04, in12); // [ ]row const __m256i T_00_06B = _mm256_unpackhi_epi16(in04, in12); // [ ] const __m256i T_00_07A = _mm256_unpacklo_epi16(in00, in08); // [83 03 82 02 81 01 81 00] row08 row00 const __m256i T_00_07B = _mm256_unpackhi_epi16(in00, in08); // [87 07 86 06 85 05 84 04] __m256i O0A, O1A, O2A, O3A, O4A, O5A, O6A, O7A; __m256i O0B, O1B, O2B, O3B, O4B, O5B, O6B, O7B; __m256i EO0A, EO1A, EO2A, EO3A; __m256i EO0B, EO1B, EO2B, EO3B; __m256i EEO0A, EEO1A; __m256i EEO0B, EEO1B; __m256i EEE0A, EEE1A; __m256i EEE0B, EEE1B; { __m256i T00, T01; #define COMPUTE_ROW(row0103, row0507, row0911, row1315, c0103, c0507, c0911, c1315, row) \ T00 = _mm256_add_epi32(_mm256_madd_epi16(row0103, c0103), _mm256_madd_epi16(row0507, c0507)); \ T01 = _mm256_add_epi32(_mm256_madd_epi16(row0911, c0911), _mm256_madd_epi16(row1315, c1315)); \ row = _mm256_add_epi32(T00, T01); COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_p43_p45, c16_p35_p40, c16_p21_p29, c16_p04_p13, O0A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_p29_p43, c16_n21_p04, c16_n45_n40, c16_n13_n35, O1A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_p04_p40, c16_n43_n35, c16_p29_n13, c16_p21_p45, O2A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n21_p35, c16_p04_n43, c16_p13_p45, c16_n29_n40, O3A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n40_p29, c16_p45_n13, c16_n43_n04, c16_p35_p21, O4A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n45_p21, c16_p13_p29, c16_p35_n43, c16_n40_p04, O5A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n35_p13, c16_n40_p45, c16_p04_p21, c16_p43_n29, O6A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, c16_n13_p04, c16_n29_p21, c16_n40_p35, c16_n45_p43, O7A) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_p43_p45, c16_p35_p40, c16_p21_p29, c16_p04_p13, O0B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_p29_p43, c16_n21_p04, c16_n45_n40, c16_n13_n35, O1B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_p04_p40, c16_n43_n35, c16_p29_n13, c16_p21_p45, O2B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n21_p35, c16_p04_n43, c16_p13_p45, c16_n29_n40, O3B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n40_p29, c16_p45_n13, c16_n43_n04, c16_p35_p21, O4B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n45_p21, c16_p13_p29, c16_p35_n43, c16_n40_p04, O5B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n35_p13, c16_n40_p45, c16_p04_p21, c16_p43_n29, O6B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, c16_n13_p04, c16_n29_p21, c16_n40_p35, c16_n45_p43, O7B) #undef COMPUTE_ROW } EO0A = _mm256_add_epi32(_mm256_madd_epi16(T_00_04A, c16_p38_p44), _mm256_madd_epi16(T_00_05A, c16_p09_p25)); // EO0 EO0B = _mm256_add_epi32(_mm256_madd_epi16(T_00_04B, c16_p38_p44), 
_mm256_madd_epi16(T_00_05B, c16_p09_p25)); EO1A = _mm256_add_epi32(_mm256_madd_epi16(T_00_04A, c16_n09_p38), _mm256_madd_epi16(T_00_05A, c16_n25_n44)); // EO1 EO1B = _mm256_add_epi32(_mm256_madd_epi16(T_00_04B, c16_n09_p38), _mm256_madd_epi16(T_00_05B, c16_n25_n44)); EO2A = _mm256_add_epi32(_mm256_madd_epi16(T_00_04A, c16_n44_p25), _mm256_madd_epi16(T_00_05A, c16_p38_p09)); // EO2 EO2B = _mm256_add_epi32(_mm256_madd_epi16(T_00_04B, c16_n44_p25), _mm256_madd_epi16(T_00_05B, c16_p38_p09)); EO3A = _mm256_add_epi32(_mm256_madd_epi16(T_00_04A, c16_n25_p09), _mm256_madd_epi16(T_00_05A, c16_n44_p38)); // EO3 EO3B = _mm256_add_epi32(_mm256_madd_epi16(T_00_04B, c16_n25_p09), _mm256_madd_epi16(T_00_05B, c16_n44_p38)); EEO0A = _mm256_madd_epi16(T_00_06A, c16_p17_p42); EEO0B = _mm256_madd_epi16(T_00_06B, c16_p17_p42); EEO1A = _mm256_madd_epi16(T_00_06A, c16_n42_p17); EEO1B = _mm256_madd_epi16(T_00_06B, c16_n42_p17); EEE0A = _mm256_madd_epi16(T_00_07A, c16_p32_p32); EEE0B = _mm256_madd_epi16(T_00_07B, c16_p32_p32); EEE1A = _mm256_madd_epi16(T_00_07A, c16_n32_p32); EEE1B = _mm256_madd_epi16(T_00_07B, c16_n32_p32); { const __m256i EE0A = _mm256_add_epi32(EEE0A, EEO0A); // EE0 = EEE0 + EEO0 const __m256i EE0B = _mm256_add_epi32(EEE0B, EEO0B); const __m256i EE1A = _mm256_add_epi32(EEE1A, EEO1A); // EE1 = EEE1 + EEO1 const __m256i EE1B = _mm256_add_epi32(EEE1B, EEO1B); const __m256i EE3A = _mm256_sub_epi32(EEE0A, EEO0A); // EE2 = EEE0 - EEO0 const __m256i EE3B = _mm256_sub_epi32(EEE0B, EEO0B); const __m256i EE2A = _mm256_sub_epi32(EEE1A, EEO1A); // EE3 = EEE1 - EEO1 const __m256i EE2B = _mm256_sub_epi32(EEE1B, EEO1B); const __m256i E0A = _mm256_add_epi32(EE0A, EO0A); // E0 = EE0 + EO0 const __m256i E0B = _mm256_add_epi32(EE0B, EO0B); const __m256i E1A = _mm256_add_epi32(EE1A, EO1A); // E1 = EE1 + EO1 const __m256i E1B = _mm256_add_epi32(EE1B, EO1B); const __m256i E2A = _mm256_add_epi32(EE2A, EO2A); // E2 = EE2 + EO2 const __m256i E2B = _mm256_add_epi32(EE2B, EO2B); const __m256i E3A = _mm256_add_epi32(EE3A, EO3A); // E3 = EE3 + EO3 const __m256i E3B = _mm256_add_epi32(EE3B, EO3B); const __m256i E7A = _mm256_sub_epi32(EE0A, EO0A); // E7 = EE0 - EO0 const __m256i E7B = _mm256_sub_epi32(EE0B, EO0B); const __m256i E6A = _mm256_sub_epi32(EE1A, EO1A); // E6 = EE1 - EO1 const __m256i E6B = _mm256_sub_epi32(EE1B, EO1B); const __m256i E5A = _mm256_sub_epi32(EE2A, EO2A); // E5 = EE2 - EO2 const __m256i E5B = _mm256_sub_epi32(EE2B, EO2B); const __m256i E4A = _mm256_sub_epi32(EE3A, EO3A); // E4 = EE3 - EO3 const __m256i E4B = _mm256_sub_epi32(EE3B, EO3B); const __m256i T10A = _mm256_add_epi32(E0A, c32_rnd); // E0 + rnd const __m256i T10B = _mm256_add_epi32(E0B, c32_rnd); const __m256i T11A = _mm256_add_epi32(E1A, c32_rnd); // E1 + rnd const __m256i T11B = _mm256_add_epi32(E1B, c32_rnd); const __m256i T12A = _mm256_add_epi32(E2A, c32_rnd); // E2 + rnd const __m256i T12B = _mm256_add_epi32(E2B, c32_rnd); const __m256i T13A = _mm256_add_epi32(E3A, c32_rnd); // E3 + rnd const __m256i T13B = _mm256_add_epi32(E3B, c32_rnd); const __m256i T14A = _mm256_add_epi32(E4A, c32_rnd); // E4 + rnd const __m256i T14B = _mm256_add_epi32(E4B, c32_rnd); const __m256i T15A = _mm256_add_epi32(E5A, c32_rnd); // E5 + rnd const __m256i T15B = _mm256_add_epi32(E5B, c32_rnd); const __m256i T16A = _mm256_add_epi32(E6A, c32_rnd); // E6 + rnd const __m256i T16B = _mm256_add_epi32(E6B, c32_rnd); const __m256i T17A = _mm256_add_epi32(E7A, c32_rnd); // E7 + rnd const __m256i T17B = _mm256_add_epi32(E7B, c32_rnd); const __m256i T20A = 
_mm256_add_epi32(T10A, O0A); // E0 + O0 + rnd const __m256i T20B = _mm256_add_epi32(T10B, O0B); const __m256i T21A = _mm256_add_epi32(T11A, O1A); // E1 + O1 + rnd const __m256i T21B = _mm256_add_epi32(T11B, O1B); const __m256i T22A = _mm256_add_epi32(T12A, O2A); // E2 + O2 + rnd const __m256i T22B = _mm256_add_epi32(T12B, O2B); const __m256i T23A = _mm256_add_epi32(T13A, O3A); // E3 + O3 + rnd const __m256i T23B = _mm256_add_epi32(T13B, O3B); const __m256i T24A = _mm256_add_epi32(T14A, O4A); // E4 const __m256i T24B = _mm256_add_epi32(T14B, O4B); const __m256i T25A = _mm256_add_epi32(T15A, O5A); // E5 const __m256i T25B = _mm256_add_epi32(T15B, O5B); const __m256i T26A = _mm256_add_epi32(T16A, O6A); // E6 const __m256i T26B = _mm256_add_epi32(T16B, O6B); const __m256i T27A = _mm256_add_epi32(T17A, O7A); // E7 const __m256i T27B = _mm256_add_epi32(T17B, O7B); const __m256i T2FA = _mm256_sub_epi32(T10A, O0A); // E0 - O0 + rnd const __m256i T2FB = _mm256_sub_epi32(T10B, O0B); const __m256i T2EA = _mm256_sub_epi32(T11A, O1A); // E1 - O1 + rnd const __m256i T2EB = _mm256_sub_epi32(T11B, O1B); const __m256i T2DA = _mm256_sub_epi32(T12A, O2A); // E2 - O2 + rnd const __m256i T2DB = _mm256_sub_epi32(T12B, O2B); const __m256i T2CA = _mm256_sub_epi32(T13A, O3A); // E3 - O3 + rnd const __m256i T2CB = _mm256_sub_epi32(T13B, O3B); const __m256i T2BA = _mm256_sub_epi32(T14A, O4A); // E4 const __m256i T2BB = _mm256_sub_epi32(T14B, O4B); const __m256i T2AA = _mm256_sub_epi32(T15A, O5A); // E5 const __m256i T2AB = _mm256_sub_epi32(T15B, O5B); const __m256i T29A = _mm256_sub_epi32(T16A, O6A); // E6 const __m256i T29B = _mm256_sub_epi32(T16B, O6B); const __m256i T28A = _mm256_sub_epi32(T17A, O7A); // E7 const __m256i T28B = _mm256_sub_epi32(T17B, O7B); const __m256i T30A = _mm256_srai_epi32(T20A, nShift); // [30 20 10 00] // This operation makes it much slower than 128 const __m256i T30B = _mm256_srai_epi32(T20B, nShift); // [70 60 50 40] // This operation makes it much slower than 128 const __m256i T31A = _mm256_srai_epi32(T21A, nShift); // [31 21 11 01] // This operation makes it much slower than 128 const __m256i T31B = _mm256_srai_epi32(T21B, nShift); // [71 61 51 41] // This operation makes it much slower than 128 const __m256i T32A = _mm256_srai_epi32(T22A, nShift); // [32 22 12 02] // This operation makes it much slower than 128 const __m256i T32B = _mm256_srai_epi32(T22B, nShift); // [72 62 52 42] // This operation makes it much slower than 128 const __m256i T33A = _mm256_srai_epi32(T23A, nShift); // [33 23 13 03] // This operation makes it much slower than 128 const __m256i T33B = _mm256_srai_epi32(T23B, nShift); // [73 63 53 43] // This operation makes it much slower than 128 const __m256i T34A = _mm256_srai_epi32(T24A, nShift); // [34 24 14 04] // This operation makes it much slower than 128 const __m256i T34B = _mm256_srai_epi32(T24B, nShift); // [74 64 54 44] // This operation makes it much slower than 128 const __m256i T35A = _mm256_srai_epi32(T25A, nShift); // [35 25 15 05] // This operation makes it much slower than 128 const __m256i T35B = _mm256_srai_epi32(T25B, nShift); // [75 65 55 45] // This operation makes it much slower than 128 const __m256i T36A = _mm256_srai_epi32(T26A, nShift); // [36 26 16 06] // This operation makes it much slower than 128 const __m256i T36B = _mm256_srai_epi32(T26B, nShift); // [76 66 56 46] // This operation makes it much slower than 128 const __m256i T37A = _mm256_srai_epi32(T27A, nShift); // [37 27 17 07] // This operation makes it much slower than 128 const __m256i T37B = _mm256_srai_epi32(T27B, nShift); // [77 67 57 47] // This operation makes it much slower than 128 const __m256i T38A = _mm256_srai_epi32(T28A, nShift); // [30 20 10 00] x8 // This operation makes it much slower than 128 const __m256i T38B = _mm256_srai_epi32(T28B, nShift); // [70 60 50 40] const __m256i T39A = _mm256_srai_epi32(T29A, nShift); // [31 21 11 01] x9 // This operation makes it much slower than 128 const __m256i T39B = _mm256_srai_epi32(T29B, nShift); // [71 61 51 41] const __m256i T3AA = _mm256_srai_epi32(T2AA, nShift); // [32 22 12 02] xA // This operation makes it much slower than 128 const __m256i T3AB = _mm256_srai_epi32(T2AB, nShift); // [72 62 52 42] const __m256i T3BA = _mm256_srai_epi32(T2BA, nShift); // [33 23 13 03] xB // This operation makes it much slower than 128 const __m256i T3BB = _mm256_srai_epi32(T2BB, nShift); // [73 63 53 43] const __m256i T3CA = _mm256_srai_epi32(T2CA, nShift); // [34 24 14 04] xC // This operation makes it much slower than 128 const __m256i T3CB = _mm256_srai_epi32(T2CB, nShift); // [74 64 54 44] const __m256i T3DA = _mm256_srai_epi32(T2DA, nShift); // [35 25 15 05] xD // This operation makes it much slower than 128 const __m256i T3DB = _mm256_srai_epi32(T2DB, nShift); // [75 65 55 45] const __m256i T3EA = _mm256_srai_epi32(T2EA, nShift); // [36 26 16 06] xE // This operation makes it much slower than 128 const __m256i T3EB = _mm256_srai_epi32(T2EB, nShift); // [76 66 56 46] const __m256i T3FA = _mm256_srai_epi32(T2FA, nShift); // [37 27 17 07] xF // This operation makes it much slower than 128 const __m256i T3FB = _mm256_srai_epi32(T2FB, nShift); // [77 67 57 47] res00 = _mm256_packs_epi32(T30A, T30B); // [70 60 50 40 30 20 10 00] res01 = _mm256_packs_epi32(T31A, T31B); // [71 61 51 41 31 21 11 01] res02 = _mm256_packs_epi32(T32A, T32B); // [72 62 52 42 32 22 12 02] res03 = _mm256_packs_epi32(T33A, T33B); // [73 63 53 43 33 23 13 03] res04 = _mm256_packs_epi32(T34A, T34B); // [74 64 54 44 34 24 14 04] res05 = _mm256_packs_epi32(T35A, T35B); // [75 65 55 45 35 25 15 05] res06 = _mm256_packs_epi32(T36A, T36B); // [76 66 56 46 36 26 16 06] res07 = _mm256_packs_epi32(T37A, T37B); // [77 67 57 47 37 27 17 07] res08 = _mm256_packs_epi32(T38A, T38B); // [A0 ... 80] res09 = _mm256_packs_epi32(T39A, T39B); // [A1 ... 81] res10 = _mm256_packs_epi32(T3AA, T3AB); // [A2 ... 82] res11 = _mm256_packs_epi32(T3BA, T3BB); // [A3 ... 83] res12 = _mm256_packs_epi32(T3CA, T3CB); // [A4 ... 84] res13 = _mm256_packs_epi32(T3DA, T3DB); // [A5 ... 85] res14 = _mm256_packs_epi32(T3EA, T3EB); // [A6 ... 86] res15 = _mm256_packs_epi32(T3FA, T3FB); // [A7 ... 87] } //transpose matrix 16x16 16bit.
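/* Annotation (added for readability; not part of the original davs2 source): the macro below transposes a 16x16 tile of 16-bit coefficients held in sixteen __m256i rows, i.e. the scalar loop for (r = 0; r < 16; r++) for (c = 0; c < 16; c++) out[c][r] = in[r][c]; It is a three-stage interleave network: _mm256_unpacklo/hi_epi16 merges pairs of rows, the _epi32 and _epi64 unpacks widen the interleave to 4- and 8-element groups, and the final _mm256_permute2x128_si256() calls exchange 128-bit halves, which the in-lane unpack instructions cannot move across the two AVX2 lanes. */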
{ __m256i tr0_0, tr0_1, tr0_2, tr0_3, tr0_4, tr0_5, tr0_6, tr0_7, tr0_8, tr0_9, tr0_10, tr0_11, tr0_12, tr0_13, tr0_14, tr0_15; #define TRANSPOSE_16x16_16BIT(I0, I1, I2, I3, I4, I5, I6, I7, I8, I9, I10, I11, I12, I13, I14, I15, O0, O1, O2, O3, O4, O5, O6, O7, O8, O9, O10, O11, O12, O13, O14, O15) \ tr0_0 = _mm256_unpacklo_epi16(I0, I1); \ tr0_1 = _mm256_unpacklo_epi16(I2, I3); \ tr0_2 = _mm256_unpacklo_epi16(I4, I5); \ tr0_3 = _mm256_unpacklo_epi16(I6, I7); \ tr0_4 = _mm256_unpacklo_epi16(I8, I9); \ tr0_5 = _mm256_unpacklo_epi16(I10, I11); \ tr0_6 = _mm256_unpacklo_epi16(I12, I13); \ tr0_7 = _mm256_unpacklo_epi16(I14, I15); \ tr0_8 = _mm256_unpackhi_epi16(I0, I1); \ tr0_9 = _mm256_unpackhi_epi16(I2, I3); \ tr0_10 = _mm256_unpackhi_epi16(I4, I5); \ tr0_11 = _mm256_unpackhi_epi16(I6, I7); \ tr0_12 = _mm256_unpackhi_epi16(I8, I9); \ tr0_13 = _mm256_unpackhi_epi16(I10, I11); \ tr0_14 = _mm256_unpackhi_epi16(I12, I13); \ tr0_15 = _mm256_unpackhi_epi16(I14, I15); \ O0 = _mm256_unpacklo_epi32(tr0_0, tr0_1); \ O1 = _mm256_unpacklo_epi32(tr0_2, tr0_3); \ O2 = _mm256_unpacklo_epi32(tr0_4, tr0_5); \ O3 = _mm256_unpacklo_epi32(tr0_6, tr0_7); \ O4 = _mm256_unpackhi_epi32(tr0_0, tr0_1); \ O5 = _mm256_unpackhi_epi32(tr0_2, tr0_3); \ O6 = _mm256_unpackhi_epi32(tr0_4, tr0_5); \ O7 = _mm256_unpackhi_epi32(tr0_6, tr0_7); \ O8 = _mm256_unpacklo_epi32(tr0_8, tr0_9); \ O9 = _mm256_unpacklo_epi32(tr0_10, tr0_11); \ O10 = _mm256_unpacklo_epi32(tr0_12, tr0_13); \ O11 = _mm256_unpacklo_epi32(tr0_14, tr0_15); \ O12 = _mm256_unpackhi_epi32(tr0_8, tr0_9); \ O13 = _mm256_unpackhi_epi32(tr0_10, tr0_11); \ O14 = _mm256_unpackhi_epi32(tr0_12, tr0_13); \ O15 = _mm256_unpackhi_epi32(tr0_14, tr0_15); \ tr0_0 = _mm256_unpacklo_epi64(O0, O1); \ tr0_1 = _mm256_unpacklo_epi64(O2, O3); \ tr0_2 = _mm256_unpackhi_epi64(O0, O1); \ tr0_3 = _mm256_unpackhi_epi64(O2, O3); \ tr0_4 = _mm256_unpacklo_epi64(O4, O5); \ tr0_5 = _mm256_unpacklo_epi64(O6, O7); \ tr0_6 = _mm256_unpackhi_epi64(O4, O5); \ tr0_7 = _mm256_unpackhi_epi64(O6, O7); \ tr0_8 = _mm256_unpacklo_epi64(O8, O9); \ tr0_9 = _mm256_unpacklo_epi64(O10, O11); \ tr0_10 = _mm256_unpackhi_epi64(O8, O9); \ tr0_11 = _mm256_unpackhi_epi64(O10, O11); \ tr0_12 = _mm256_unpacklo_epi64(O12, O13); \ tr0_13 = _mm256_unpacklo_epi64(O14, O15); \ tr0_14 = _mm256_unpackhi_epi64(O12, O13); \ tr0_15 = _mm256_unpackhi_epi64(O14, O15); \ O0 = _mm256_permute2x128_si256(tr0_0, tr0_1, 0x20); \ O1 = _mm256_permute2x128_si256(tr0_2, tr0_3, 0x20); \ O2 = _mm256_permute2x128_si256(tr0_4, tr0_5, 0x20); \ O3 = _mm256_permute2x128_si256(tr0_6, tr0_7, 0x20); \ O4 = _mm256_permute2x128_si256(tr0_8, tr0_9, 0x20); \ O5 = _mm256_permute2x128_si256(tr0_10, tr0_11, 0x20); \ O6 = _mm256_permute2x128_si256(tr0_12, tr0_13, 0x20); \ O7 = _mm256_permute2x128_si256(tr0_14, tr0_15, 0x20); \ O8 = _mm256_permute2x128_si256(tr0_0, tr0_1, 0x31); \ O9 = _mm256_permute2x128_si256(tr0_2, tr0_3, 0x31); \ O10 = _mm256_permute2x128_si256(tr0_4, tr0_5, 0x31); \ O11 = _mm256_permute2x128_si256(tr0_6, tr0_7, 0x31); \ O12 = _mm256_permute2x128_si256(tr0_8, tr0_9, 0x31); \ O13 = _mm256_permute2x128_si256(tr0_10, tr0_11, 0x31); \ O14 = _mm256_permute2x128_si256(tr0_12, tr0_13, 0x31); \ O15 = _mm256_permute2x128_si256(tr0_14, tr0_15, 0x31); \ TRANSPOSE_16x16_16BIT(res00, res01, res02, res03, res04, res05, res06, res07, res08, res09, res10, res11, res12, res13, res14, res15, in00, in01, in02, in03, in04, in05, in06, in07, in08, in09, in10, in11, in12, in13, in14, in15) #undef TRANSPOSE_16x16_16BIT } nShift = shift; c32_rnd = 
_mm256_set1_epi32(shift ? (1 << (shift - 1)) : 0); // pass == 1: rounding offset for the second pass } // clip max_val = _mm256_set1_epi16((1 << (clip - 1)) - 1); min_val = _mm256_set1_epi16(-(1 << (clip - 1))); in00 = _mm256_max_epi16(_mm256_min_epi16(in00, max_val), min_val); in01 = _mm256_max_epi16(_mm256_min_epi16(in01, max_val), min_val); in02 = _mm256_max_epi16(_mm256_min_epi16(in02, max_val), min_val); in03 = _mm256_max_epi16(_mm256_min_epi16(in03, max_val), min_val); in04 = _mm256_max_epi16(_mm256_min_epi16(in04, max_val), min_val); in05 = _mm256_max_epi16(_mm256_min_epi16(in05, max_val), min_val); in06 = _mm256_max_epi16(_mm256_min_epi16(in06, max_val), min_val); in07 = _mm256_max_epi16(_mm256_min_epi16(in07, max_val), min_val); in08 = _mm256_max_epi16(_mm256_min_epi16(in08, max_val), min_val); in09 = _mm256_max_epi16(_mm256_min_epi16(in09, max_val), min_val); in10 = _mm256_max_epi16(_mm256_min_epi16(in10, max_val), min_val); in11 = _mm256_max_epi16(_mm256_min_epi16(in11, max_val), min_val); in12 = _mm256_max_epi16(_mm256_min_epi16(in12, max_val), min_val); in13 = _mm256_max_epi16(_mm256_min_epi16(in13, max_val), min_val); in14 = _mm256_max_epi16(_mm256_min_epi16(in14, max_val), min_val); in15 = _mm256_max_epi16(_mm256_min_epi16(in15, max_val), min_val); // store _mm256_storeu_si256((__m256i*)&dst[0 * 16 + 0], in00); _mm256_storeu_si256((__m256i*)&dst[1 * 16 + 0], in01); _mm256_storeu_si256((__m256i*)&dst[2 * 16 + 0], in02); _mm256_storeu_si256((__m256i*)&dst[3 * 16 + 0], in03); _mm256_storeu_si256((__m256i*)&dst[4 * 16 + 0], in04); _mm256_storeu_si256((__m256i*)&dst[5 * 16 + 0], in05); _mm256_storeu_si256((__m256i*)&dst[6 * 16 + 0], in06); _mm256_storeu_si256((__m256i*)&dst[7 * 16 + 0], in07); _mm256_storeu_si256((__m256i*)&dst[8 * 16 + 0], in08); _mm256_storeu_si256((__m256i*)&dst[9 * 16 + 0], in09); _mm256_storeu_si256((__m256i*)&dst[10 * 16 + 0], in10); _mm256_storeu_si256((__m256i*)&dst[11 * 16 + 0], in11); _mm256_storeu_si256((__m256i*)&dst[12 * 16 + 0], in12); _mm256_storeu_si256((__m256i*)&dst[13 * 16 + 0], in13); _mm256_storeu_si256((__m256i*)&dst[14 * 16 + 0], in14); _mm256_storeu_si256((__m256i*)&dst[15 * 16 + 0], in15); } void idct_32x32_avx2(const coeff_t *src, coeff_t *dst, int i_dst) { int shift = 20 - g_bit_depth - (i_dst & 0x01); int clip = g_bit_depth + 1 + (i_dst & 0x01); int k, i; __m256i max_val, min_val; __m256i EEO0A, EEO1A, EEO2A, EEO3A, EEO0B, EEO1B, EEO2B, EEO3B; __m256i EEEO0A, EEEO0B, EEEO1A, EEEO1B; __m256i EEEE0A, EEEE0B, EEEE1A, EEEE1B; __m256i EEE0A, EEE0B, EEE1A, EEE1B, EEE3A, EEE3B, EEE2A, EEE2B; __m256i EE0A, EE0B, EE1A, EE1B, EE2A, EE2B, EE3A, EE3B, EE7A, EE7B, EE6A, EE6B, EE5A, EE5B, EE4A, EE4B; __m256i E0A, E0B, E1A, E1B, E2A, E2B, E3A, E3B, E4A, E4B, E5A, E5B, E6A, E6B, E7A, E7B, EFA, EFB, EEA, EEB, EDA, EDB, ECA, ECB, EBA, EBB, EAA, EAB, E9A, E9B, E8A, E8B; __m256i T10A, T10B, T11A, T11B, T12A, T12B, T13A, T13B, T14A, T14B, T15A, T15B, T16A, T16B, T17A, T17B, T18A, T18B, T19A, T19B, T1AA, T1AB, T1BA, T1BB, T1CA, T1CB, T1DA, T1DB, T1EA, T1EB, T1FA, T1FB; __m256i T2_00A, T2_00B, T2_01A, T2_01B, T2_02A, T2_02B, T2_03A, T2_03B, T2_04A, T2_04B, T2_05A, T2_05B, T2_06A, T2_06B, T2_07A, T2_07B, T2_08A, T2_08B, T2_09A, T2_09B, T2_10A, T2_10B, T2_11A, T2_11B, T2_12A, T2_12B, T2_13A, T2_13B, T2_14A, T2_14B, T2_15A, T2_15B, T2_31A, T2_31B, T2_30A, T2_30B, T2_29A, T2_29B, T2_28A, T2_28B, T2_27A, T2_27B, T2_26A, T2_26B, T2_25A, T2_25B, T2_24A, T2_24B, T2_23A, T2_23B, T2_22A, T2_22B, T2_21A, T2_21B, T2_20A, T2_20B, T2_19A, T2_19B, T2_18A, T2_18B, T2_17A, T2_17B, T2_16A, T2_16B; __m256i
T3_00A, T3_00B, T3_01A, T3_01B, T3_02A, T3_02B, T3_03A, T3_03B, T3_04A, T3_04B, T3_05A, T3_05B, T3_06A, T3_06B, T3_07A, T3_07B, T3_08A, T3_08B, T3_09A, T3_09B, T3_10A, T3_10B, T3_11A, T3_11B, T3_12A, T3_12B, T3_13A, T3_13B, T3_14A, T3_14B, T3_15A, T3_15B; __m256i T3_16A, T3_16B, T3_17A, T3_17B, T3_18A, T3_18B, T3_19A, T3_19B, T3_20A, T3_20B, T3_21A, T3_21B, T3_22A, T3_22B, T3_23A, T3_23B, T3_24A, T3_24B, T3_25A, T3_25B, T3_26A, T3_26B, T3_27A, T3_27B, T3_28A, T3_28B, T3_29A, T3_29B, T3_30A, T3_30B, T3_31A, T3_31B; const __m256i c16_p45_p45 = _mm256_set1_epi32(0x002D002D); const __m256i c16_p43_p44 = _mm256_set1_epi32(0x002B002C); const __m256i c16_p39_p41 = _mm256_set1_epi32(0x00270029); const __m256i c16_p34_p36 = _mm256_set1_epi32(0x00220024); const __m256i c16_p27_p30 = _mm256_set1_epi32(0x001B001E); const __m256i c16_p19_p23 = _mm256_set1_epi32(0x00130017); const __m256i c16_p11_p15 = _mm256_set1_epi32(0x000B000F); const __m256i c16_p02_p07 = _mm256_set1_epi32(0x00020007); const __m256i c16_p41_p45 = _mm256_set1_epi32(0x0029002D); const __m256i c16_p23_p34 = _mm256_set1_epi32(0x00170022); const __m256i c16_n02_p11 = _mm256_set1_epi32(0xFFFE000B); const __m256i c16_n27_n15 = _mm256_set1_epi32(0xFFE5FFF1); const __m256i c16_n43_n36 = _mm256_set1_epi32(0xFFD5FFDC); const __m256i c16_n44_n45 = _mm256_set1_epi32(0xFFD4FFD3); const __m256i c16_n30_n39 = _mm256_set1_epi32(0xFFE2FFD9); const __m256i c16_n07_n19 = _mm256_set1_epi32(0xFFF9FFED); const __m256i c16_p34_p44 = _mm256_set1_epi32(0x0022002C); const __m256i c16_n07_p15 = _mm256_set1_epi32(0xFFF9000F); const __m256i c16_n41_n27 = _mm256_set1_epi32(0xFFD7FFE5); const __m256i c16_n39_n45 = _mm256_set1_epi32(0xFFD9FFD3); const __m256i c16_n02_n23 = _mm256_set1_epi32(0xFFFEFFE9); const __m256i c16_p36_p19 = _mm256_set1_epi32(0x00240013); const __m256i c16_p43_p45 = _mm256_set1_epi32(0x002B002D); const __m256i c16_p11_p30 = _mm256_set1_epi32(0x000B001E); const __m256i c16_p23_p43 = _mm256_set1_epi32(0x0017002B); const __m256i c16_n34_n07 = _mm256_set1_epi32(0xFFDEFFF9); const __m256i c16_n36_n45 = _mm256_set1_epi32(0xFFDCFFD3); const __m256i c16_p19_n11 = _mm256_set1_epi32(0x0013FFF5); const __m256i c16_p44_p41 = _mm256_set1_epi32(0x002C0029); const __m256i c16_n02_p27 = _mm256_set1_epi32(0xFFFE001B); const __m256i c16_n45_n30 = _mm256_set1_epi32(0xFFD3FFE2); const __m256i c16_n15_n39 = _mm256_set1_epi32(0xFFF1FFD9); const __m256i c16_p11_p41 = _mm256_set1_epi32(0x000B0029); const __m256i c16_n45_n27 = _mm256_set1_epi32(0xFFD3FFE5); const __m256i c16_p07_n30 = _mm256_set1_epi32(0x0007FFE2); const __m256i c16_p43_p39 = _mm256_set1_epi32(0x002B0027); const __m256i c16_n23_p15 = _mm256_set1_epi32(0xFFE9000F); const __m256i c16_n34_n45 = _mm256_set1_epi32(0xFFDEFFD3); const __m256i c16_p36_p02 = _mm256_set1_epi32(0x00240002); const __m256i c16_p19_p44 = _mm256_set1_epi32(0x0013002C); const __m256i c16_n02_p39 = _mm256_set1_epi32(0xFFFE0027); const __m256i c16_n36_n41 = _mm256_set1_epi32(0xFFDCFFD7); const __m256i c16_p43_p07 = _mm256_set1_epi32(0x002B0007); const __m256i c16_n11_p34 = _mm256_set1_epi32(0xFFF50022); const __m256i c16_n30_n44 = _mm256_set1_epi32(0xFFE2FFD4); const __m256i c16_p45_p15 = _mm256_set1_epi32(0x002D000F); const __m256i c16_n19_p27 = _mm256_set1_epi32(0xFFED001B); const __m256i c16_n23_n45 = _mm256_set1_epi32(0xFFE9FFD3); const __m256i c16_n15_p36 = _mm256_set1_epi32(0xFFF10024); const __m256i c16_n11_n45 = _mm256_set1_epi32(0xFFF5FFD3); const __m256i c16_p34_p39 = _mm256_set1_epi32(0x00220027); const __m256i c16_n45_n19 
= _mm256_set1_epi32(0xFFD3FFED); const __m256i c16_p41_n07 = _mm256_set1_epi32(0x0029FFF9); const __m256i c16_n23_p30 = _mm256_set1_epi32(0xFFE9001E); const __m256i c16_n02_n44 = _mm256_set1_epi32(0xFFFEFFD4); const __m256i c16_p27_p43 = _mm256_set1_epi32(0x001B002B); const __m256i c16_n27_p34 = _mm256_set1_epi32(0xFFE50022); const __m256i c16_p19_n39 = _mm256_set1_epi32(0x0013FFD9); const __m256i c16_n11_p43 = _mm256_set1_epi32(0xFFF5002B); const __m256i c16_p02_n45 = _mm256_set1_epi32(0x0002FFD3); const __m256i c16_p07_p45 = _mm256_set1_epi32(0x0007002D); const __m256i c16_n15_n44 = _mm256_set1_epi32(0xFFF1FFD4); const __m256i c16_p23_p41 = _mm256_set1_epi32(0x00170029); const __m256i c16_n30_n36 = _mm256_set1_epi32(0xFFE2FFDC); const __m256i c16_n36_p30 = _mm256_set1_epi32(0xFFDC001E); const __m256i c16_p41_n23 = _mm256_set1_epi32(0x0029FFE9); const __m256i c16_n44_p15 = _mm256_set1_epi32(0xFFD4000F); const __m256i c16_p45_n07 = _mm256_set1_epi32(0x002DFFF9); const __m256i c16_n45_n02 = _mm256_set1_epi32(0xFFD3FFFE); const __m256i c16_p43_p11 = _mm256_set1_epi32(0x002B000B); const __m256i c16_n39_n19 = _mm256_set1_epi32(0xFFD9FFED); const __m256i c16_p34_p27 = _mm256_set1_epi32(0x0022001B); const __m256i c16_n43_p27 = _mm256_set1_epi32(0xFFD5001B); const __m256i c16_p44_n02 = _mm256_set1_epi32(0x002CFFFE); const __m256i c16_n30_n23 = _mm256_set1_epi32(0xFFE2FFE9); const __m256i c16_p07_p41 = _mm256_set1_epi32(0x00070029); const __m256i c16_p19_n45 = _mm256_set1_epi32(0x0013FFD3); const __m256i c16_n39_p34 = _mm256_set1_epi32(0xFFD90022); const __m256i c16_p45_n11 = _mm256_set1_epi32(0x002DFFF5); const __m256i c16_n36_n15 = _mm256_set1_epi32(0xFFDCFFF1); const __m256i c16_n45_p23 = _mm256_set1_epi32(0xFFD30017); const __m256i c16_p27_p19 = _mm256_set1_epi32(0x001B0013); const __m256i c16_p15_n45 = _mm256_set1_epi32(0x000FFFD3); const __m256i c16_n44_p30 = _mm256_set1_epi32(0xFFD4001E); const __m256i c16_p34_p11 = _mm256_set1_epi32(0x0022000B); const __m256i c16_p07_n43 = _mm256_set1_epi32(0x0007FFD5); const __m256i c16_n41_p36 = _mm256_set1_epi32(0xFFD70024); const __m256i c16_p39_p02 = _mm256_set1_epi32(0x00270002); const __m256i c16_n44_p19 = _mm256_set1_epi32(0xFFD40013); const __m256i c16_n02_p36 = _mm256_set1_epi32(0xFFFE0024); const __m256i c16_p45_n34 = _mm256_set1_epi32(0x002DFFDE); const __m256i c16_n15_n23 = _mm256_set1_epi32(0xFFF1FFE9); const __m256i c16_n39_p43 = _mm256_set1_epi32(0xFFD9002B); const __m256i c16_p30_p07 = _mm256_set1_epi32(0x001E0007); const __m256i c16_p27_n45 = _mm256_set1_epi32(0x001BFFD3); const __m256i c16_n41_p11 = _mm256_set1_epi32(0xFFD7000B); const __m256i c16_n39_p15 = _mm256_set1_epi32(0xFFD9000F); const __m256i c16_n30_p45 = _mm256_set1_epi32(0xFFE2002D); const __m256i c16_p27_p02 = _mm256_set1_epi32(0x001B0002); const __m256i c16_p41_n44 = _mm256_set1_epi32(0x0029FFD4); const __m256i c16_n11_n19 = _mm256_set1_epi32(0xFFF5FFED); const __m256i c16_n45_p36 = _mm256_set1_epi32(0xFFD30024); const __m256i c16_n07_p34 = _mm256_set1_epi32(0xFFF90022); const __m256i c16_p43_n23 = _mm256_set1_epi32(0x002BFFE9); const __m256i c16_n30_p11 = _mm256_set1_epi32(0xFFE2000B); const __m256i c16_n45_p43 = _mm256_set1_epi32(0xFFD3002B); const __m256i c16_n19_p36 = _mm256_set1_epi32(0xFFED0024); const __m256i c16_p23_n02 = _mm256_set1_epi32(0x0017FFFE); const __m256i c16_p45_n39 = _mm256_set1_epi32(0x002DFFD9); const __m256i c16_p27_n41 = _mm256_set1_epi32(0x001BFFD7); const __m256i c16_n15_n07 = _mm256_set1_epi32(0xFFF1FFF9); const __m256i c16_n44_p34 = 
_mm256_set1_epi32(0xFFD40022); const __m256i c16_n19_p07 = _mm256_set1_epi32(0xFFED0007); const __m256i c16_n39_p30 = _mm256_set1_epi32(0xFFD9001E); const __m256i c16_n45_p44 = _mm256_set1_epi32(0xFFD3002C); const __m256i c16_n36_p43 = _mm256_set1_epi32(0xFFDC002B); const __m256i c16_n15_p27 = _mm256_set1_epi32(0xFFF1001B); const __m256i c16_p11_p02 = _mm256_set1_epi32(0x000B0002); const __m256i c16_p34_n23 = _mm256_set1_epi32(0x0022FFE9); const __m256i c16_p45_n41 = _mm256_set1_epi32(0x002DFFD7); const __m256i c16_n07_p02 = _mm256_set1_epi32(0xFFF90002); const __m256i c16_n15_p11 = _mm256_set1_epi32(0xFFF1000B); const __m256i c16_n23_p19 = _mm256_set1_epi32(0xFFE90013); const __m256i c16_n30_p27 = _mm256_set1_epi32(0xFFE2001B); const __m256i c16_n36_p34 = _mm256_set1_epi32(0xFFDC0022); const __m256i c16_n41_p39 = _mm256_set1_epi32(0xFFD70027); const __m256i c16_n44_p43 = _mm256_set1_epi32(0xFFD4002B); const __m256i c16_n45_p45 = _mm256_set1_epi32(0xFFD3002D); // const __m256i c16_p43_p45 = _mm256_set1_epi32(0x002B002D); const __m256i c16_p35_p40 = _mm256_set1_epi32(0x00230028); const __m256i c16_p21_p29 = _mm256_set1_epi32(0x0015001D); const __m256i c16_p04_p13 = _mm256_set1_epi32(0x0004000D); const __m256i c16_p29_p43 = _mm256_set1_epi32(0x001D002B); const __m256i c16_n21_p04 = _mm256_set1_epi32(0xFFEB0004); const __m256i c16_n45_n40 = _mm256_set1_epi32(0xFFD3FFD8); const __m256i c16_n13_n35 = _mm256_set1_epi32(0xFFF3FFDD); const __m256i c16_p04_p40 = _mm256_set1_epi32(0x00040028); const __m256i c16_n43_n35 = _mm256_set1_epi32(0xFFD5FFDD); const __m256i c16_p29_n13 = _mm256_set1_epi32(0x001DFFF3); const __m256i c16_p21_p45 = _mm256_set1_epi32(0x0015002D); const __m256i c16_n21_p35 = _mm256_set1_epi32(0xFFEB0023); const __m256i c16_p04_n43 = _mm256_set1_epi32(0x0004FFD5); const __m256i c16_p13_p45 = _mm256_set1_epi32(0x000D002D); const __m256i c16_n29_n40 = _mm256_set1_epi32(0xFFE3FFD8); const __m256i c16_n40_p29 = _mm256_set1_epi32(0xFFD8001D); const __m256i c16_p45_n13 = _mm256_set1_epi32(0x002DFFF3); const __m256i c16_n43_n04 = _mm256_set1_epi32(0xFFD5FFFC); const __m256i c16_p35_p21 = _mm256_set1_epi32(0x00230015); const __m256i c16_n45_p21 = _mm256_set1_epi32(0xFFD30015); const __m256i c16_p13_p29 = _mm256_set1_epi32(0x000D001D); const __m256i c16_p35_n43 = _mm256_set1_epi32(0x0023FFD5); const __m256i c16_n40_p04 = _mm256_set1_epi32(0xFFD80004); const __m256i c16_n35_p13 = _mm256_set1_epi32(0xFFDD000D); const __m256i c16_n40_p45 = _mm256_set1_epi32(0xFFD8002D); const __m256i c16_p04_p21 = _mm256_set1_epi32(0x00040015); const __m256i c16_p43_n29 = _mm256_set1_epi32(0x002BFFE3); const __m256i c16_n13_p04 = _mm256_set1_epi32(0xFFF30004); const __m256i c16_n29_p21 = _mm256_set1_epi32(0xFFE30015); const __m256i c16_n40_p35 = _mm256_set1_epi32(0xFFD80023); //const __m256i c16_n45_p43 = _mm256_set1_epi32(0xFFD3002B); const __m256i c16_p38_p44 = _mm256_set1_epi32(0x0026002C); const __m256i c16_p09_p25 = _mm256_set1_epi32(0x00090019); const __m256i c16_n09_p38 = _mm256_set1_epi32(0xFFF70026); const __m256i c16_n25_n44 = _mm256_set1_epi32(0xFFE7FFD4); const __m256i c16_n44_p25 = _mm256_set1_epi32(0xFFD40019); const __m256i c16_p38_p09 = _mm256_set1_epi32(0x00260009); const __m256i c16_n25_p09 = _mm256_set1_epi32(0xFFE70009); const __m256i c16_n44_p38 = _mm256_set1_epi32(0xFFD40026); const __m256i c16_p17_p42 = _mm256_set1_epi32(0x0011002A); const __m256i c16_n42_p17 = _mm256_set1_epi32(0xFFD60011); const __m256i c16_p32_p32 = _mm256_set1_epi32(0x00200020); const __m256i c16_n32_p32 = 
_mm256_set1_epi32(0xFFE00020); __m256i c32_rnd = _mm256_set1_epi32(16); int nShift = 5; // DCT1 __m256i in00[2], in01[2], in02[2], in03[2], in04[2], in05[2], in06[2], in07[2], in08[2], in09[2], in10[2], in11[2], in12[2], in13[2], in14[2], in15[2]; __m256i in16[2], in17[2], in18[2], in19[2], in20[2], in21[2], in22[2], in23[2], in24[2], in25[2], in26[2], in27[2], in28[2], in29[2], in30[2], in31[2]; __m256i res00[2], res01[2], res02[2], res03[2], res04[2], res05[2], res06[2], res07[2], res08[2], res09[2], res10[2], res11[2], res12[2], res13[2], res14[2], res15[2]; __m256i res16[2], res17[2], res18[2], res19[2], res20[2], res21[2], res22[2], res23[2], res24[2], res25[2], res26[2], res27[2], res28[2], res29[2], res30[2], res31[2]; int pass, part; UNUSED_PARAMETER(i_dst); for (i = 0; i < 2; i++) { const int offset = (i << 4); in00[i] = _mm256_lddqu_si256((const __m256i*)&src[0 * 32 + offset]); in01[i] = _mm256_lddqu_si256((const __m256i*)&src[1 * 32 + offset]); in02[i] = _mm256_lddqu_si256((const __m256i*)&src[2 * 32 + offset]); in03[i] = _mm256_lddqu_si256((const __m256i*)&src[3 * 32 + offset]); in04[i] = _mm256_lddqu_si256((const __m256i*)&src[4 * 32 + offset]); in05[i] = _mm256_lddqu_si256((const __m256i*)&src[5 * 32 + offset]); in06[i] = _mm256_lddqu_si256((const __m256i*)&src[6 * 32 + offset]); in07[i] = _mm256_lddqu_si256((const __m256i*)&src[7 * 32 + offset]); in08[i] = _mm256_lddqu_si256((const __m256i*)&src[8 * 32 + offset]); in09[i] = _mm256_lddqu_si256((const __m256i*)&src[9 * 32 + offset]); in10[i] = _mm256_lddqu_si256((const __m256i*)&src[10 * 32 + offset]); in11[i] = _mm256_lddqu_si256((const __m256i*)&src[11 * 32 + offset]); in12[i] = _mm256_lddqu_si256((const __m256i*)&src[12 * 32 + offset]); in13[i] = _mm256_lddqu_si256((const __m256i*)&src[13 * 32 + offset]); in14[i] = _mm256_lddqu_si256((const __m256i*)&src[14 * 32 + offset]); in15[i] = _mm256_lddqu_si256((const __m256i*)&src[15 * 32 + offset]); in16[i] = _mm256_lddqu_si256((const __m256i*)&src[16 * 32 + offset]); in17[i] = _mm256_lddqu_si256((const __m256i*)&src[17 * 32 + offset]); in18[i] = _mm256_lddqu_si256((const __m256i*)&src[18 * 32 + offset]); in19[i] = _mm256_lddqu_si256((const __m256i*)&src[19 * 32 + offset]); in20[i] = _mm256_lddqu_si256((const __m256i*)&src[20 * 32 + offset]); in21[i] = _mm256_lddqu_si256((const __m256i*)&src[21 * 32 + offset]); in22[i] = _mm256_lddqu_si256((const __m256i*)&src[22 * 32 + offset]); in23[i] = _mm256_lddqu_si256((const __m256i*)&src[23 * 32 + offset]); in24[i] = _mm256_lddqu_si256((const __m256i*)&src[24 * 32 + offset]); in25[i] = _mm256_lddqu_si256((const __m256i*)&src[25 * 32 + offset]); in26[i] = _mm256_lddqu_si256((const __m256i*)&src[26 * 32 + offset]); in27[i] = _mm256_lddqu_si256((const __m256i*)&src[27 * 32 + offset]); in28[i] = _mm256_lddqu_si256((const __m256i*)&src[28 * 32 + offset]); in29[i] = _mm256_lddqu_si256((const __m256i*)&src[29 * 32 + offset]); in30[i] = _mm256_lddqu_si256((const __m256i*)&src[30 * 32 + offset]); in31[i] = _mm256_lddqu_si256((const __m256i*)&src[31 * 32 + offset]); } for (pass = 0; pass < 2; pass++) { for (part = 0; part < 2; part++) { const __m256i T_00_00A = _mm256_unpacklo_epi16(in01[part], in03[part]); // [33 13 32 12 31 11 30 10] const __m256i T_00_00B = _mm256_unpackhi_epi16(in01[part], in03[part]); // [37 17 36 16 35 15 34 14] const __m256i T_00_01A = _mm256_unpacklo_epi16(in05[part], in07[part]); // [ ] const __m256i T_00_01B = _mm256_unpackhi_epi16(in05[part], in07[part]); // [ ] const __m256i T_00_02A = _mm256_unpacklo_epi16(in09[part], 
in11[part]); // [ ] const __m256i T_00_02B = _mm256_unpackhi_epi16(in09[part], in11[part]); // [ ] const __m256i T_00_03A = _mm256_unpacklo_epi16(in13[part], in15[part]); // [ ] const __m256i T_00_03B = _mm256_unpackhi_epi16(in13[part], in15[part]); // [ ] const __m256i T_00_04A = _mm256_unpacklo_epi16(in17[part], in19[part]); // [ ] const __m256i T_00_04B = _mm256_unpackhi_epi16(in17[part], in19[part]); // [ ] const __m256i T_00_05A = _mm256_unpacklo_epi16(in21[part], in23[part]); // [ ] const __m256i T_00_05B = _mm256_unpackhi_epi16(in21[part], in23[part]); // [ ] const __m256i T_00_06A = _mm256_unpacklo_epi16(in25[part], in27[part]); // [ ] const __m256i T_00_06B = _mm256_unpackhi_epi16(in25[part], in27[part]); // [ ] const __m256i T_00_07A = _mm256_unpacklo_epi16(in29[part], in31[part]); // const __m256i T_00_07B = _mm256_unpackhi_epi16(in29[part], in31[part]); // [ ] const __m256i T_00_08A = _mm256_unpacklo_epi16(in02[part], in06[part]); // [ ] const __m256i T_00_08B = _mm256_unpackhi_epi16(in02[part], in06[part]); // [ ] const __m256i T_00_09A = _mm256_unpacklo_epi16(in10[part], in14[part]); // [ ] const __m256i T_00_09B = _mm256_unpackhi_epi16(in10[part], in14[part]); // [ ] const __m256i T_00_10A = _mm256_unpacklo_epi16(in18[part], in22[part]); // [ ] const __m256i T_00_10B = _mm256_unpackhi_epi16(in18[part], in22[part]); // [ ] const __m256i T_00_11A = _mm256_unpacklo_epi16(in26[part], in30[part]); // [ ] const __m256i T_00_11B = _mm256_unpackhi_epi16(in26[part], in30[part]); // [ ] const __m256i T_00_12A = _mm256_unpacklo_epi16(in04[part], in12[part]); // [ ] const __m256i T_00_12B = _mm256_unpackhi_epi16(in04[part], in12[part]); // [ ] const __m256i T_00_13A = _mm256_unpacklo_epi16(in20[part], in28[part]); // [ ] const __m256i T_00_13B = _mm256_unpackhi_epi16(in20[part], in28[part]); // [ ] const __m256i T_00_14A = _mm256_unpacklo_epi16(in08[part], in24[part]); // const __m256i T_00_14B = _mm256_unpackhi_epi16(in08[part], in24[part]); // [ ] const __m256i T_00_15A = _mm256_unpacklo_epi16(in00[part], in16[part]); // const __m256i T_00_15B = _mm256_unpackhi_epi16(in00[part], in16[part]); // [ ] __m256i O00A, O01A, O02A, O03A, O04A, O05A, O06A, O07A, O08A, O09A, O10A, O11A, O12A, O13A, O14A, O15A; __m256i O00B, O01B, O02B, O03B, O04B, O05B, O06B, O07B, O08B, O09B, O10B, O11B, O12B, O13B, O14B, O15B; __m256i EO0A, EO1A, EO2A, EO3A, EO4A, EO5A, EO6A, EO7A; __m256i EO0B, EO1B, EO2B, EO3B, EO4B, EO5B, EO6B, EO7B; { __m256i T00, T01, T02, T03; #define COMPUTE_ROW(r0103, r0507, r0911, r1315, r1719, r2123, r2527, r2931, c0103, c0507, c0911, c1315, c1719, c2123, c2527, c2931, row) \ T00 = _mm256_add_epi32(_mm256_madd_epi16(r0103, c0103), _mm256_madd_epi16(r0507, c0507)); \ T01 = _mm256_add_epi32(_mm256_madd_epi16(r0911, c0911), _mm256_madd_epi16(r1315, c1315)); \ T02 = _mm256_add_epi32(_mm256_madd_epi16(r1719, c1719), _mm256_madd_epi16(r2123, c2123)); \ T03 = _mm256_add_epi32(_mm256_madd_epi16(r2527, c2527), _mm256_madd_epi16(r2931, c2931)); \ row = _mm256_add_epi32(_mm256_add_epi32(T00, T01), _mm256_add_epi32(T02, T03)); COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_p45_p45, c16_p43_p44, c16_p39_p41, c16_p34_p36, c16_p27_p30, c16_p19_p23, c16_p11_p15, c16_p02_p07, O00A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_p41_p45, c16_p23_p34, c16_n02_p11, c16_n27_n15, c16_n43_n36, c16_n44_n45, c16_n30_n39, c16_n07_n19, O01A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, 
T_00_05A, T_00_06A, T_00_07A, \ c16_p34_p44, c16_n07_p15, c16_n41_n27, c16_n39_n45, c16_n02_n23, c16_p36_p19, c16_p43_p45, c16_p11_p30, O02A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_p23_p43, c16_n34_n07, c16_n36_n45, c16_p19_n11, c16_p44_p41, c16_n02_p27, c16_n45_n30, c16_n15_n39, O03A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_p11_p41, c16_n45_n27, c16_p07_n30, c16_p43_p39, c16_n23_p15, c16_n34_n45, c16_p36_p02, c16_p19_p44, O04A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n02_p39, c16_n36_n41, c16_p43_p07, c16_n11_p34, c16_n30_n44, c16_p45_p15, c16_n19_p27, c16_n23_n45, O05A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n15_p36, c16_n11_n45, c16_p34_p39, c16_n45_n19, c16_p41_n07, c16_n23_p30, c16_n02_n44, c16_p27_p43, O06A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n27_p34, c16_p19_n39, c16_n11_p43, c16_p02_n45, c16_p07_p45, c16_n15_n44, c16_p23_p41, c16_n30_n36, O07A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n36_p30, c16_p41_n23, c16_n44_p15, c16_p45_n07, c16_n45_n02, c16_p43_p11, c16_n39_n19, c16_p34_p27, O08A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n43_p27, c16_p44_n02, c16_n30_n23, c16_p07_p41, c16_p19_n45, c16_n39_p34, c16_p45_n11, c16_n36_n15, O09A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n45_p23, c16_p27_p19, c16_p15_n45, c16_n44_p30, c16_p34_p11, c16_p07_n43, c16_n41_p36, c16_p39_p02, O10A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n44_p19, c16_n02_p36, c16_p45_n34, c16_n15_n23, c16_n39_p43, c16_p30_p07, c16_p27_n45, c16_n41_p11, O11A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n39_p15, c16_n30_p45, c16_p27_p02, c16_p41_n44, c16_n11_n19, c16_n45_p36, c16_n07_p34, c16_p43_n23, O12A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n30_p11, c16_n45_p43, c16_n19_p36, c16_p23_n02, c16_p45_n39, c16_p27_n41, c16_n15_n07, c16_n44_p34, O13A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n19_p07, c16_n39_p30, c16_n45_p44, c16_n36_p43, c16_n15_p27, c16_p11_p02, c16_p34_n23, c16_p45_n41, O14A) COMPUTE_ROW(T_00_00A, T_00_01A, T_00_02A, T_00_03A, T_00_04A, T_00_05A, T_00_06A, T_00_07A, \ c16_n07_p02, c16_n15_p11, c16_n23_p19, c16_n30_p27, c16_n36_p34, c16_n41_p39, c16_n44_p43, c16_n45_p45, O15A) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_p45_p45, c16_p43_p44, c16_p39_p41, c16_p34_p36, c16_p27_p30, c16_p19_p23, c16_p11_p15, c16_p02_p07, O00B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_p41_p45, c16_p23_p34, c16_n02_p11, c16_n27_n15, c16_n43_n36, c16_n44_n45, c16_n30_n39, c16_n07_n19, O01B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_p34_p44, c16_n07_p15, c16_n41_n27, c16_n39_n45, c16_n02_n23, c16_p36_p19, c16_p43_p45, c16_p11_p30, O02B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_p23_p43, c16_n34_n07, c16_n36_n45, c16_p19_n11, 
c16_p44_p41, c16_n02_p27, c16_n45_n30, c16_n15_n39, O03B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_p11_p41, c16_n45_n27, c16_p07_n30, c16_p43_p39, c16_n23_p15, c16_n34_n45, c16_p36_p02, c16_p19_p44, O04B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n02_p39, c16_n36_n41, c16_p43_p07, c16_n11_p34, c16_n30_n44, c16_p45_p15, c16_n19_p27, c16_n23_n45, O05B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n15_p36, c16_n11_n45, c16_p34_p39, c16_n45_n19, c16_p41_n07, c16_n23_p30, c16_n02_n44, c16_p27_p43, O06B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n27_p34, c16_p19_n39, c16_n11_p43, c16_p02_n45, c16_p07_p45, c16_n15_n44, c16_p23_p41, c16_n30_n36, O07B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n36_p30, c16_p41_n23, c16_n44_p15, c16_p45_n07, c16_n45_n02, c16_p43_p11, c16_n39_n19, c16_p34_p27, O08B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n43_p27, c16_p44_n02, c16_n30_n23, c16_p07_p41, c16_p19_n45, c16_n39_p34, c16_p45_n11, c16_n36_n15, O09B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n45_p23, c16_p27_p19, c16_p15_n45, c16_n44_p30, c16_p34_p11, c16_p07_n43, c16_n41_p36, c16_p39_p02, O10B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n44_p19, c16_n02_p36, c16_p45_n34, c16_n15_n23, c16_n39_p43, c16_p30_p07, c16_p27_n45, c16_n41_p11, O11B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n39_p15, c16_n30_p45, c16_p27_p02, c16_p41_n44, c16_n11_n19, c16_n45_p36, c16_n07_p34, c16_p43_n23, O12B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n30_p11, c16_n45_p43, c16_n19_p36, c16_p23_n02, c16_p45_n39, c16_p27_n41, c16_n15_n07, c16_n44_p34, O13B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n19_p07, c16_n39_p30, c16_n45_p44, c16_n36_p43, c16_n15_p27, c16_p11_p02, c16_p34_n23, c16_p45_n41, O14B) COMPUTE_ROW(T_00_00B, T_00_01B, T_00_02B, T_00_03B, T_00_04B, T_00_05B, T_00_06B, T_00_07B, \ c16_n07_p02, c16_n15_p11, c16_n23_p19, c16_n30_p27, c16_n36_p34, c16_n41_p39, c16_n44_p43, c16_n45_p45, O15B) #undef COMPUTE_ROW } { __m256i T00, T01; #define COMPUTE_ROW(row0206, row1014, row1822, row2630, c0206, c1014, c1822, c2630, row) \ T00 = _mm256_add_epi32(_mm256_madd_epi16(row0206, c0206), _mm256_madd_epi16(row1014, c1014)); \ T01 = _mm256_add_epi32(_mm256_madd_epi16(row1822, c1822), _mm256_madd_epi16(row2630, c2630)); \ row = _mm256_add_epi32(T00, T01); COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_p43_p45, c16_p35_p40, c16_p21_p29, c16_p04_p13, EO0A) COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_p29_p43, c16_n21_p04, c16_n45_n40, c16_n13_n35, EO1A) COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_p04_p40, c16_n43_n35, c16_p29_n13, c16_p21_p45, EO2A) COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_n21_p35, c16_p04_n43, c16_p13_p45, c16_n29_n40, EO3A) COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_n40_p29, c16_p45_n13, c16_n43_n04, c16_p35_p21, EO4A) COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_n45_p21, c16_p13_p29, c16_p35_n43, c16_n40_p04, EO5A) COMPUTE_ROW(T_00_08A, 
T_00_09A, T_00_10A, T_00_11A, c16_n35_p13, c16_n40_p45, c16_p04_p21, c16_p43_n29, EO6A)
            COMPUTE_ROW(T_00_08A, T_00_09A, T_00_10A, T_00_11A, c16_n13_p04, c16_n29_p21, c16_n40_p35, c16_n45_p43, EO7A)

            COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_p43_p45, c16_p35_p40, c16_p21_p29, c16_p04_p13, EO0B)
            COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_p29_p43, c16_n21_p04, c16_n45_n40, c16_n13_n35, EO1B)
            COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_p04_p40, c16_n43_n35, c16_p29_n13, c16_p21_p45, EO2B)
            COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_n21_p35, c16_p04_n43, c16_p13_p45, c16_n29_n40, EO3B)
            COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_n40_p29, c16_p45_n13, c16_n43_n04, c16_p35_p21, EO4B)
            COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_n45_p21, c16_p13_p29, c16_p35_n43, c16_n40_p04, EO5B)
            COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_n35_p13, c16_n40_p45, c16_p04_p21, c16_p43_n29, EO6B)
            COMPUTE_ROW(T_00_08B, T_00_09B, T_00_10B, T_00_11B, c16_n13_p04, c16_n29_p21, c16_n40_p35, c16_n45_p43, EO7B)
#undef COMPUTE_ROW
        }

        EEO0A = _mm256_add_epi32(_mm256_madd_epi16(T_00_12A, c16_p38_p44), _mm256_madd_epi16(T_00_13A, c16_p09_p25));
        EEO1A = _mm256_add_epi32(_mm256_madd_epi16(T_00_12A, c16_n09_p38), _mm256_madd_epi16(T_00_13A, c16_n25_n44));
        EEO2A = _mm256_add_epi32(_mm256_madd_epi16(T_00_12A, c16_n44_p25), _mm256_madd_epi16(T_00_13A, c16_p38_p09));
        EEO3A = _mm256_add_epi32(_mm256_madd_epi16(T_00_12A, c16_n25_p09), _mm256_madd_epi16(T_00_13A, c16_n44_p38));
        EEO0B = _mm256_add_epi32(_mm256_madd_epi16(T_00_12B, c16_p38_p44), _mm256_madd_epi16(T_00_13B, c16_p09_p25));
        EEO1B = _mm256_add_epi32(_mm256_madd_epi16(T_00_12B, c16_n09_p38), _mm256_madd_epi16(T_00_13B, c16_n25_n44));
        EEO2B = _mm256_add_epi32(_mm256_madd_epi16(T_00_12B, c16_n44_p25), _mm256_madd_epi16(T_00_13B, c16_p38_p09));
        EEO3B = _mm256_add_epi32(_mm256_madd_epi16(T_00_12B, c16_n25_p09), _mm256_madd_epi16(T_00_13B, c16_n44_p38));

        EEEO0A = _mm256_madd_epi16(T_00_14A, c16_p17_p42);
        EEEO0B = _mm256_madd_epi16(T_00_14B, c16_p17_p42);
        EEEO1A = _mm256_madd_epi16(T_00_14A, c16_n42_p17);
        EEEO1B = _mm256_madd_epi16(T_00_14B, c16_n42_p17);

        EEEE0A = _mm256_madd_epi16(T_00_15A, c16_p32_p32);
        EEEE0B = _mm256_madd_epi16(T_00_15B, c16_p32_p32);
        EEEE1A = _mm256_madd_epi16(T_00_15A, c16_n32_p32);
        EEEE1B = _mm256_madd_epi16(T_00_15B, c16_n32_p32);

        EEE0A = _mm256_add_epi32(EEEE0A, EEEO0A);       // EEE0 = EEEE0 + EEEO0
        EEE0B = _mm256_add_epi32(EEEE0B, EEEO0B);
        EEE1A = _mm256_add_epi32(EEEE1A, EEEO1A);       // EEE1 = EEEE1 + EEEO1
        EEE1B = _mm256_add_epi32(EEEE1B, EEEO1B);
        EEE3A = _mm256_sub_epi32(EEEE0A, EEEO0A);       // EEE3 = EEEE0 - EEEO0
        EEE3B = _mm256_sub_epi32(EEEE0B, EEEO0B);
        EEE2A = _mm256_sub_epi32(EEEE1A, EEEO1A);       // EEE2 = EEEE1 - EEEO1
        EEE2B = _mm256_sub_epi32(EEEE1B, EEEO1B);

        EE0A = _mm256_add_epi32(EEE0A, EEO0A);          // EE0 = EEE0 + EEO0
        EE0B = _mm256_add_epi32(EEE0B, EEO0B);
        EE1A = _mm256_add_epi32(EEE1A, EEO1A);          // EE1 = EEE1 + EEO1
        EE1B = _mm256_add_epi32(EEE1B, EEO1B);
        EE2A = _mm256_add_epi32(EEE2A, EEO2A);          // EE2 = EEE2 + EEO2
        EE2B = _mm256_add_epi32(EEE2B, EEO2B);
        EE3A = _mm256_add_epi32(EEE3A, EEO3A);          // EE3 = EEE3 + EEO3
        EE3B = _mm256_add_epi32(EEE3B, EEO3B);
        EE7A = _mm256_sub_epi32(EEE0A, EEO0A);          // EE7 = EEE0 - EEO0
        EE7B = _mm256_sub_epi32(EEE0B, EEO0B);
        EE6A = _mm256_sub_epi32(EEE1A, EEO1A);          // EE6 = EEE1 - EEO1
        EE6B = _mm256_sub_epi32(EEE1B, EEO1B);
        EE5A = _mm256_sub_epi32(EEE2A, EEO2A);          // EE5 = EEE2 - EEO2
        EE5B = _mm256_sub_epi32(EEE2B, EEO2B);
        EE4A = _mm256_sub_epi32(EEE3A, EEO3A);          // EE4 = EEE3 - EEO3
        EE4B = _mm256_sub_epi32(EEE3B, EEO3B);

        E0A = _mm256_add_epi32(EE0A, EO0A);             // E0 = EE0 + EO0
        E0B = _mm256_add_epi32(EE0B, EO0B);
        E1A = _mm256_add_epi32(EE1A, EO1A);             // E1 = EE1 + EO1
        E1B = _mm256_add_epi32(EE1B, EO1B);
        E2A = _mm256_add_epi32(EE2A, EO2A);             // E2 = EE2 + EO2
        E2B = _mm256_add_epi32(EE2B, EO2B);
        E3A = _mm256_add_epi32(EE3A, EO3A);             // E3 = EE3 + EO3
        E3B = _mm256_add_epi32(EE3B, EO3B);
        E4A = _mm256_add_epi32(EE4A, EO4A);             // E4 = EE4 + EO4
        E4B = _mm256_add_epi32(EE4B, EO4B);
        E5A = _mm256_add_epi32(EE5A, EO5A);             // E5 = EE5 + EO5
        E5B = _mm256_add_epi32(EE5B, EO5B);
        E6A = _mm256_add_epi32(EE6A, EO6A);             // E6 = EE6 + EO6
        E6B = _mm256_add_epi32(EE6B, EO6B);
        E7A = _mm256_add_epi32(EE7A, EO7A);             // E7 = EE7 + EO7
        E7B = _mm256_add_epi32(EE7B, EO7B);
        EFA = _mm256_sub_epi32(EE0A, EO0A);             // EF = EE0 - EO0 (E15)
        EFB = _mm256_sub_epi32(EE0B, EO0B);
        EEA = _mm256_sub_epi32(EE1A, EO1A);             // EE = EE1 - EO1 (E14)
        EEB = _mm256_sub_epi32(EE1B, EO1B);
        EDA = _mm256_sub_epi32(EE2A, EO2A);             // ED = EE2 - EO2 (E13)
        EDB = _mm256_sub_epi32(EE2B, EO2B);
        ECA = _mm256_sub_epi32(EE3A, EO3A);             // EC = EE3 - EO3 (E12)
        ECB = _mm256_sub_epi32(EE3B, EO3B);
        EBA = _mm256_sub_epi32(EE4A, EO4A);             // EB = EE4 - EO4 (E11)
        EBB = _mm256_sub_epi32(EE4B, EO4B);
        EAA = _mm256_sub_epi32(EE5A, EO5A);             // EA = EE5 - EO5 (E10)
        EAB = _mm256_sub_epi32(EE5B, EO5B);
        E9A = _mm256_sub_epi32(EE6A, EO6A);             // E9 = EE6 - EO6
        E9B = _mm256_sub_epi32(EE6B, EO6B);
        E8A = _mm256_sub_epi32(EE7A, EO7A);             // E8 = EE7 - EO7
        E8B = _mm256_sub_epi32(EE7B, EO7B);

        T10A = _mm256_add_epi32(E0A, c32_rnd);          // E0 + rnd
        T10B = _mm256_add_epi32(E0B, c32_rnd);
        T11A = _mm256_add_epi32(E1A, c32_rnd);          // E1 + rnd
        T11B = _mm256_add_epi32(E1B, c32_rnd);
        T12A = _mm256_add_epi32(E2A, c32_rnd);          // E2 + rnd
        T12B = _mm256_add_epi32(E2B, c32_rnd);
        T13A = _mm256_add_epi32(E3A, c32_rnd);          // E3 + rnd
        T13B = _mm256_add_epi32(E3B, c32_rnd);
        T14A = _mm256_add_epi32(E4A, c32_rnd);          // E4 + rnd
        T14B = _mm256_add_epi32(E4B, c32_rnd);
        T15A = _mm256_add_epi32(E5A, c32_rnd);          // E5 + rnd
        T15B = _mm256_add_epi32(E5B, c32_rnd);
        T16A = _mm256_add_epi32(E6A, c32_rnd);          // E6 + rnd
        T16B = _mm256_add_epi32(E6B, c32_rnd);
        T17A = _mm256_add_epi32(E7A, c32_rnd);          // E7 + rnd
        T17B = _mm256_add_epi32(E7B, c32_rnd);
        T18A = _mm256_add_epi32(E8A, c32_rnd);          // E8 + rnd
        T18B = _mm256_add_epi32(E8B, c32_rnd);
        T19A = _mm256_add_epi32(E9A, c32_rnd);          // E9 + rnd
        T19B = _mm256_add_epi32(E9B, c32_rnd);
        T1AA = _mm256_add_epi32(EAA, c32_rnd);          // E10 + rnd
        T1AB = _mm256_add_epi32(EAB, c32_rnd);
        T1BA = _mm256_add_epi32(EBA, c32_rnd);          // E11 + rnd
        T1BB = _mm256_add_epi32(EBB, c32_rnd);
        T1CA = _mm256_add_epi32(ECA, c32_rnd);          // E12 + rnd
        T1CB = _mm256_add_epi32(ECB, c32_rnd);
        T1DA = _mm256_add_epi32(EDA, c32_rnd);          // E13 + rnd
        T1DB = _mm256_add_epi32(EDB, c32_rnd);
        T1EA = _mm256_add_epi32(EEA, c32_rnd);          // E14 + rnd
        T1EB = _mm256_add_epi32(EEB, c32_rnd);
        T1FA = _mm256_add_epi32(EFA, c32_rnd);          // E15 + rnd
        T1FB = _mm256_add_epi32(EFB, c32_rnd);

        T2_00A = _mm256_add_epi32(T10A, O00A);          // E0 + O0 + rnd
        T2_00B = _mm256_add_epi32(T10B, O00B);
        T2_01A = _mm256_add_epi32(T11A, O01A);          // E1 + O1 + rnd
        T2_01B = _mm256_add_epi32(T11B, O01B);
        T2_02A = _mm256_add_epi32(T12A, O02A);          // E2 + O2 + rnd
        T2_02B = _mm256_add_epi32(T12B, O02B);
        T2_03A = _mm256_add_epi32(T13A, O03A);          // E3 + O3 + rnd
        T2_03B = _mm256_add_epi32(T13B, O03B);
        T2_04A = _mm256_add_epi32(T14A, O04A);          // E4 + O4 + rnd
        T2_04B = _mm256_add_epi32(T14B, O04B);
        T2_05A = _mm256_add_epi32(T15A, O05A);          // E5 + O5 + rnd
        T2_05B = _mm256_add_epi32(T15B, O05B);
        T2_06A = _mm256_add_epi32(T16A, O06A);          // E6 + O6 + rnd
        T2_06B = _mm256_add_epi32(T16B, O06B);
        T2_07A = _mm256_add_epi32(T17A, O07A);          // E7 + O7 + rnd
        T2_07B =
_mm256_add_epi32(T17B, O07B); T2_08A = _mm256_add_epi32(T18A, O08A); // E8 T2_08B = _mm256_add_epi32(T18B, O08B); T2_09A = _mm256_add_epi32(T19A, O09A); // E9 T2_09B = _mm256_add_epi32(T19B, O09B); T2_10A = _mm256_add_epi32(T1AA, O10A); // E10 T2_10B = _mm256_add_epi32(T1AB, O10B); T2_11A = _mm256_add_epi32(T1BA, O11A); // E11 T2_11B = _mm256_add_epi32(T1BB, O11B); T2_12A = _mm256_add_epi32(T1CA, O12A); // E12 T2_12B = _mm256_add_epi32(T1CB, O12B); T2_13A = _mm256_add_epi32(T1DA, O13A); // E13 T2_13B = _mm256_add_epi32(T1DB, O13B); T2_14A = _mm256_add_epi32(T1EA, O14A); // E14 T2_14B = _mm256_add_epi32(T1EB, O14B); T2_15A = _mm256_add_epi32(T1FA, O15A); // E15 T2_15B = _mm256_add_epi32(T1FB, O15B); T2_31A = _mm256_sub_epi32(T10A, O00A); // E0 - O0 + rnd T2_31B = _mm256_sub_epi32(T10B, O00B); T2_30A = _mm256_sub_epi32(T11A, O01A); // E1 - O1 + rnd T2_30B = _mm256_sub_epi32(T11B, O01B); T2_29A = _mm256_sub_epi32(T12A, O02A); // E2 - O2 + rnd T2_29B = _mm256_sub_epi32(T12B, O02B); T2_28A = _mm256_sub_epi32(T13A, O03A); // E3 - O3 + rnd T2_28B = _mm256_sub_epi32(T13B, O03B); T2_27A = _mm256_sub_epi32(T14A, O04A); // E4 T2_27B = _mm256_sub_epi32(T14B, O04B); T2_26A = _mm256_sub_epi32(T15A, O05A); // E5 T2_26B = _mm256_sub_epi32(T15B, O05B); T2_25A = _mm256_sub_epi32(T16A, O06A); // E6 T2_25B = _mm256_sub_epi32(T16B, O06B); T2_24A = _mm256_sub_epi32(T17A, O07A); // E7 T2_24B = _mm256_sub_epi32(T17B, O07B); T2_23A = _mm256_sub_epi32(T18A, O08A); // T2_23B = _mm256_sub_epi32(T18B, O08B); T2_22A = _mm256_sub_epi32(T19A, O09A); // T2_22B = _mm256_sub_epi32(T19B, O09B); T2_21A = _mm256_sub_epi32(T1AA, O10A); // T2_21B = _mm256_sub_epi32(T1AB, O10B); T2_20A = _mm256_sub_epi32(T1BA, O11A); // T2_20B = _mm256_sub_epi32(T1BB, O11B); T2_19A = _mm256_sub_epi32(T1CA, O12A); // T2_19B = _mm256_sub_epi32(T1CB, O12B); T2_18A = _mm256_sub_epi32(T1DA, O13A); // T2_18B = _mm256_sub_epi32(T1DB, O13B); T2_17A = _mm256_sub_epi32(T1EA, O14A); // T2_17B = _mm256_sub_epi32(T1EB, O14B); T2_16A = _mm256_sub_epi32(T1FA, O15A); // T2_16B = _mm256_sub_epi32(T1FB, O15B); T3_00A = _mm256_srai_epi32(T2_00A, nShift); // [30 20 10 00] // This operation make it much slower than 128 T3_00B = _mm256_srai_epi32(T2_00B, nShift); // [70 60 50 40] // This operation make it much slower than 128 T3_01A = _mm256_srai_epi32(T2_01A, nShift); // [31 21 11 01] // This operation make it much slower than 128 T3_01B = _mm256_srai_epi32(T2_01B, nShift); // [71 61 51 41] // This operation make it much slower than 128 T3_02A = _mm256_srai_epi32(T2_02A, nShift); // [32 22 12 02] // This operation make it much slower than 128 T3_02B = _mm256_srai_epi32(T2_02B, nShift); // [72 62 52 42] T3_03A = _mm256_srai_epi32(T2_03A, nShift); // [33 23 13 03] T3_03B = _mm256_srai_epi32(T2_03B, nShift); // [73 63 53 43] T3_04A = _mm256_srai_epi32(T2_04A, nShift); // [33 24 14 04] T3_04B = _mm256_srai_epi32(T2_04B, nShift); // [74 64 54 44] T3_05A = _mm256_srai_epi32(T2_05A, nShift); // [35 25 15 05] T3_05B = _mm256_srai_epi32(T2_05B, nShift); // [75 65 55 45] T3_06A = _mm256_srai_epi32(T2_06A, nShift); // [36 26 16 06] T3_06B = _mm256_srai_epi32(T2_06B, nShift); // [76 66 56 46] T3_07A = _mm256_srai_epi32(T2_07A, nShift); // [37 27 17 07] T3_07B = _mm256_srai_epi32(T2_07B, nShift); // [77 67 57 47] T3_08A = _mm256_srai_epi32(T2_08A, nShift); // [30 20 10 00] x8 T3_08B = _mm256_srai_epi32(T2_08B, nShift); // [70 60 50 40] T3_09A = _mm256_srai_epi32(T2_09A, nShift); // [31 21 11 01] x9 T3_09B = _mm256_srai_epi32(T2_09B, nShift); // [71 61 51 41] T3_10A = 
_mm256_srai_epi32(T2_10A, nShift); // [32 22 12 02] xA T3_10B = _mm256_srai_epi32(T2_10B, nShift); // [72 62 52 42] T3_11A = _mm256_srai_epi32(T2_11A, nShift); // [33 23 13 03] xB T3_11B = _mm256_srai_epi32(T2_11B, nShift); // [73 63 53 43] T3_12A = _mm256_srai_epi32(T2_12A, nShift); // [33 24 14 04] xC T3_12B = _mm256_srai_epi32(T2_12B, nShift); // [74 64 54 44] T3_13A = _mm256_srai_epi32(T2_13A, nShift); // [35 25 15 05] xD T3_13B = _mm256_srai_epi32(T2_13B, nShift); // [75 65 55 45] T3_14A = _mm256_srai_epi32(T2_14A, nShift); // [36 26 16 06] xE T3_14B = _mm256_srai_epi32(T2_14B, nShift); // [76 66 56 46] T3_15A = _mm256_srai_epi32(T2_15A, nShift); // [37 27 17 07] xF T3_15B = _mm256_srai_epi32(T2_15B, nShift); // [77 67 57 47] T3_16A = _mm256_srai_epi32(T2_16A, nShift); // [30 20 10 00] // This operation make it much slower than 128 T3_16B = _mm256_srai_epi32(T2_16B, nShift); // [70 60 50 40] // This operation make it much slower than 128 T3_17A = _mm256_srai_epi32(T2_17A, nShift); // [31 21 11 01] // This operation make it much slower than 128 T3_17B = _mm256_srai_epi32(T2_17B, nShift); // [71 61 51 41] T3_18A = _mm256_srai_epi32(T2_18A, nShift); // [32 22 12 02] T3_18B = _mm256_srai_epi32(T2_18B, nShift); // [72 62 52 42] T3_19A = _mm256_srai_epi32(T2_19A, nShift); // [33 23 13 03] T3_19B = _mm256_srai_epi32(T2_19B, nShift); // [73 63 53 43] T3_20A = _mm256_srai_epi32(T2_20A, nShift); // [33 24 14 04] T3_20B = _mm256_srai_epi32(T2_20B, nShift); // [74 64 54 44] T3_21A = _mm256_srai_epi32(T2_21A, nShift); // [35 25 15 05] T3_21B = _mm256_srai_epi32(T2_21B, nShift); // [75 65 55 45] T3_22A = _mm256_srai_epi32(T2_22A, nShift); // [36 26 16 06] T3_22B = _mm256_srai_epi32(T2_22B, nShift); // [76 66 56 46] T3_23A = _mm256_srai_epi32(T2_23A, nShift); // [37 27 17 07] T3_23B = _mm256_srai_epi32(T2_23B, nShift); // [77 67 57 47] T3_24A = _mm256_srai_epi32(T2_24A, nShift); // [30 20 10 00] x8 T3_24B = _mm256_srai_epi32(T2_24B, nShift); // [70 60 50 40] T3_25A = _mm256_srai_epi32(T2_25A, nShift); // [31 21 11 01] x9 T3_25B = _mm256_srai_epi32(T2_25B, nShift); // [71 61 51 41] T3_26A = _mm256_srai_epi32(T2_26A, nShift); // [32 22 12 02] xA T3_26B = _mm256_srai_epi32(T2_26B, nShift); // [72 62 52 42] T3_27A = _mm256_srai_epi32(T2_27A, nShift); // [33 23 13 03] xB T3_27B = _mm256_srai_epi32(T2_27B, nShift); // [73 63 53 43] T3_28A = _mm256_srai_epi32(T2_28A, nShift); // [33 24 14 04] xC T3_28B = _mm256_srai_epi32(T2_28B, nShift); // [74 64 54 44] T3_29A = _mm256_srai_epi32(T2_29A, nShift); // [35 25 15 05] xD T3_29B = _mm256_srai_epi32(T2_29B, nShift); // [75 65 55 45] T3_30A = _mm256_srai_epi32(T2_30A, nShift); // [36 26 16 06] xE T3_30B = _mm256_srai_epi32(T2_30B, nShift); // [76 66 56 46] T3_31A = _mm256_srai_epi32(T2_31A, nShift); // [37 27 17 07] xF T3_31B = _mm256_srai_epi32(T2_31B, nShift); // [77 67 57 47] res00[part] = _mm256_packs_epi32(T3_00A, T3_00B); // [70 60 50 40 30 20 10 00] res01[part] = _mm256_packs_epi32(T3_01A, T3_01B); // [71 61 51 41 31 21 11 01] res02[part] = _mm256_packs_epi32(T3_02A, T3_02B); // [72 62 52 42 32 22 12 02] res03[part] = _mm256_packs_epi32(T3_03A, T3_03B); // [73 63 53 43 33 23 13 03] res04[part] = _mm256_packs_epi32(T3_04A, T3_04B); // [74 64 54 44 34 24 14 04] res05[part] = _mm256_packs_epi32(T3_05A, T3_05B); // [75 65 55 45 35 25 15 05] res06[part] = _mm256_packs_epi32(T3_06A, T3_06B); // [76 66 56 46 36 26 16 06] res07[part] = _mm256_packs_epi32(T3_07A, T3_07B); // [77 67 57 47 37 27 17 07] res08[part] = _mm256_packs_epi32(T3_08A, T3_08B); // [A0 ... 
80] res09[part] = _mm256_packs_epi32(T3_09A, T3_09B); // [A1 ... 81] res10[part] = _mm256_packs_epi32(T3_10A, T3_10B); // [A2 ... 82] res11[part] = _mm256_packs_epi32(T3_11A, T3_11B); // [A3 ... 83] res12[part] = _mm256_packs_epi32(T3_12A, T3_12B); // [A4 ... 84] res13[part] = _mm256_packs_epi32(T3_13A, T3_13B); // [A5 ... 85] res14[part] = _mm256_packs_epi32(T3_14A, T3_14B); // [A6 ... 86] res15[part] = _mm256_packs_epi32(T3_15A, T3_15B); // [A7 ... 87] res16[part] = _mm256_packs_epi32(T3_16A, T3_16B); res17[part] = _mm256_packs_epi32(T3_17A, T3_17B); res18[part] = _mm256_packs_epi32(T3_18A, T3_18B); res19[part] = _mm256_packs_epi32(T3_19A, T3_19B); res20[part] = _mm256_packs_epi32(T3_20A, T3_20B); res21[part] = _mm256_packs_epi32(T3_21A, T3_21B); res22[part] = _mm256_packs_epi32(T3_22A, T3_22B); res23[part] = _mm256_packs_epi32(T3_23A, T3_23B); res24[part] = _mm256_packs_epi32(T3_24A, T3_24B); res25[part] = _mm256_packs_epi32(T3_25A, T3_25B); res26[part] = _mm256_packs_epi32(T3_26A, T3_26B); res27[part] = _mm256_packs_epi32(T3_27A, T3_27B); res28[part] = _mm256_packs_epi32(T3_28A, T3_28B); res29[part] = _mm256_packs_epi32(T3_29A, T3_29B); res30[part] = _mm256_packs_epi32(T3_30A, T3_30B); res31[part] = _mm256_packs_epi32(T3_31A, T3_31B); } //transpose 32x32 matrix { __m256i tr0_0, tr0_1, tr0_2, tr0_3, tr0_4, tr0_5, tr0_6, tr0_7, tr0_8, tr0_9, tr0_10, tr0_11, tr0_12, tr0_13, tr0_14, tr0_15; #define TRANSPOSE_16x16_16BIT(I0, I1, I2, I3, I4, I5, I6, I7, I8, I9, I10, I11, I12, I13, I14, I15, O0, O1, O2, O3, O4, O5, O6, O7, O8, O9, O10, O11, O12, O13, O14, O15) \ tr0_0 = _mm256_unpacklo_epi16(I0, I1); \ tr0_1 = _mm256_unpacklo_epi16(I2, I3); \ tr0_2 = _mm256_unpacklo_epi16(I4, I5); \ tr0_3 = _mm256_unpacklo_epi16(I6, I7); \ tr0_4 = _mm256_unpacklo_epi16(I8, I9); \ tr0_5 = _mm256_unpacklo_epi16(I10, I11); \ tr0_6 = _mm256_unpacklo_epi16(I12, I13); \ tr0_7 = _mm256_unpacklo_epi16(I14, I15); \ tr0_8 = _mm256_unpackhi_epi16(I0, I1); \ tr0_9 = _mm256_unpackhi_epi16(I2, I3); \ tr0_10 = _mm256_unpackhi_epi16(I4, I5); \ tr0_11 = _mm256_unpackhi_epi16(I6, I7); \ tr0_12 = _mm256_unpackhi_epi16(I8, I9); \ tr0_13 = _mm256_unpackhi_epi16(I10, I11); \ tr0_14 = _mm256_unpackhi_epi16(I12, I13); \ tr0_15 = _mm256_unpackhi_epi16(I14, I15); \ O0 = _mm256_unpacklo_epi32(tr0_0, tr0_1); \ O1 = _mm256_unpacklo_epi32(tr0_2, tr0_3); \ O2 = _mm256_unpacklo_epi32(tr0_4, tr0_5); \ O3 = _mm256_unpacklo_epi32(tr0_6, tr0_7); \ O4 = _mm256_unpackhi_epi32(tr0_0, tr0_1); \ O5 = _mm256_unpackhi_epi32(tr0_2, tr0_3); \ O6 = _mm256_unpackhi_epi32(tr0_4, tr0_5); \ O7 = _mm256_unpackhi_epi32(tr0_6, tr0_7); \ O8 = _mm256_unpacklo_epi32(tr0_8, tr0_9); \ O9 = _mm256_unpacklo_epi32(tr0_10, tr0_11); \ O10 = _mm256_unpacklo_epi32(tr0_12, tr0_13); \ O11 = _mm256_unpacklo_epi32(tr0_14, tr0_15); \ O12 = _mm256_unpackhi_epi32(tr0_8, tr0_9); \ O13 = _mm256_unpackhi_epi32(tr0_10, tr0_11); \ O14 = _mm256_unpackhi_epi32(tr0_12, tr0_13); \ O15 = _mm256_unpackhi_epi32(tr0_14, tr0_15); \ tr0_0 = _mm256_unpacklo_epi64(O0, O1); \ tr0_1 = _mm256_unpacklo_epi64(O2, O3); \ tr0_2 = _mm256_unpackhi_epi64(O0, O1); \ tr0_3 = _mm256_unpackhi_epi64(O2, O3); \ tr0_4 = _mm256_unpacklo_epi64(O4, O5); \ tr0_5 = _mm256_unpacklo_epi64(O6, O7); \ tr0_6 = _mm256_unpackhi_epi64(O4, O5); \ tr0_7 = _mm256_unpackhi_epi64(O6, O7); \ tr0_8 = _mm256_unpacklo_epi64(O8, O9); \ tr0_9 = _mm256_unpacklo_epi64(O10, O11); \ tr0_10 = _mm256_unpackhi_epi64(O8, O9); \ tr0_11 = _mm256_unpackhi_epi64(O10, O11); \ tr0_12 = _mm256_unpacklo_epi64(O12, O13); \ tr0_13 = 
_mm256_unpacklo_epi64(O14, O15); \ tr0_14 = _mm256_unpackhi_epi64(O12, O13); \ tr0_15 = _mm256_unpackhi_epi64(O14, O15); \ O0 = _mm256_permute2x128_si256(tr0_0, tr0_1, 0x20); \ O1 = _mm256_permute2x128_si256(tr0_2, tr0_3, 0x20); \ O2 = _mm256_permute2x128_si256(tr0_4, tr0_5, 0x20); \ O3 = _mm256_permute2x128_si256(tr0_6, tr0_7, 0x20); \ O4 = _mm256_permute2x128_si256(tr0_8, tr0_9, 0x20); \ O5 = _mm256_permute2x128_si256(tr0_10, tr0_11, 0x20); \ O6 = _mm256_permute2x128_si256(tr0_12, tr0_13, 0x20); \ O7 = _mm256_permute2x128_si256(tr0_14, tr0_15, 0x20); \ O8 = _mm256_permute2x128_si256(tr0_0, tr0_1, 0x31); \ O9 = _mm256_permute2x128_si256(tr0_2, tr0_3, 0x31); \ O10 = _mm256_permute2x128_si256(tr0_4, tr0_5, 0x31); \ O11 = _mm256_permute2x128_si256(tr0_6, tr0_7, 0x31); \ O12 = _mm256_permute2x128_si256(tr0_8, tr0_9, 0x31); \ O13 = _mm256_permute2x128_si256(tr0_10, tr0_11, 0x31); \ O14 = _mm256_permute2x128_si256(tr0_12, tr0_13, 0x31); \ O15 = _mm256_permute2x128_si256(tr0_14, tr0_15, 0x31); \ TRANSPOSE_16x16_16BIT(res00[0], res01[0], res02[0], res03[0], res04[0], res05[0], res06[0], res07[0], res08[0], res09[0], res10[0], res11[0], res12[0], res13[0], res14[0], res15[0], in00[0], in01[0], in02[0], in03[0], in04[0], in05[0], in06[0], in07[0], in08[0], in09[0], in10[0], in11[0], in12[0], in13[0], in14[0], in15[0]) TRANSPOSE_16x16_16BIT(res16[0], res17[0], res18[0], res19[0], res20[0], res21[0], res22[0], res23[0], res24[0], res25[0], res26[0], res27[0], res28[0], res29[0], res30[0], res31[0], in00[1], in01[1], in02[1], in03[1], in04[1], in05[1], in06[1], in07[1], in08[1], in09[1], in10[1], in11[1], in12[1], in13[1], in14[1], in15[1]); TRANSPOSE_16x16_16BIT(res00[1], res01[1], res02[1], res03[1], res04[1], res05[1], res06[1], res07[1], res08[1], res09[1], res10[1], res11[1], res12[1], res13[1], res14[1], res15[1], in16[0], in17[0], in18[0], in19[0], in20[0], in21[0], in22[0], in23[0], in24[0], in25[0], in26[0], in27[0], in28[0], in29[0], in30[0], in31[0]); TRANSPOSE_16x16_16BIT(res16[1], res17[1], res18[1], res19[1], res20[1], res21[1], res22[1], res23[1], res24[1], res25[1], res26[1], res27[1], res28[1], res29[1], res30[1], res31[1], in16[1], in17[1], in18[1], in19[1], in20[1], in21[1], in22[1], in23[1], in24[1], in25[1], in26[1], in27[1], in28[1], in29[1], in30[1], in31[1]); #undef TRANSPOSE_16x16_16BIT } c32_rnd = _mm256_set1_epi32(shift ? 
(1 << (shift - 1)) : 0);   // rounding offset for the second pass (pass == 1)
        nShift = shift;
    }

    // clip
    max_val = _mm256_set1_epi16((1 << (clip - 1)) - 1);
    min_val = _mm256_set1_epi16(-(1 << (clip - 1)));

    for (k = 0; k < 2; k++) {
        in00[k] = _mm256_max_epi16(_mm256_min_epi16(in00[k], max_val), min_val);
        in01[k] = _mm256_max_epi16(_mm256_min_epi16(in01[k], max_val), min_val);
        in02[k] = _mm256_max_epi16(_mm256_min_epi16(in02[k], max_val), min_val);
        in03[k] = _mm256_max_epi16(_mm256_min_epi16(in03[k], max_val), min_val);
        in04[k] = _mm256_max_epi16(_mm256_min_epi16(in04[k], max_val), min_val);
        in05[k] = _mm256_max_epi16(_mm256_min_epi16(in05[k], max_val), min_val);
        in06[k] = _mm256_max_epi16(_mm256_min_epi16(in06[k], max_val), min_val);
        in07[k] = _mm256_max_epi16(_mm256_min_epi16(in07[k], max_val), min_val);
        in08[k] = _mm256_max_epi16(_mm256_min_epi16(in08[k], max_val), min_val);
        in09[k] = _mm256_max_epi16(_mm256_min_epi16(in09[k], max_val), min_val);
        in10[k] = _mm256_max_epi16(_mm256_min_epi16(in10[k], max_val), min_val);
        in11[k] = _mm256_max_epi16(_mm256_min_epi16(in11[k], max_val), min_val);
        in12[k] = _mm256_max_epi16(_mm256_min_epi16(in12[k], max_val), min_val);
        in13[k] = _mm256_max_epi16(_mm256_min_epi16(in13[k], max_val), min_val);
        in14[k] = _mm256_max_epi16(_mm256_min_epi16(in14[k], max_val), min_val);
        in15[k] = _mm256_max_epi16(_mm256_min_epi16(in15[k], max_val), min_val);
        in16[k] = _mm256_max_epi16(_mm256_min_epi16(in16[k], max_val), min_val);
        in17[k] = _mm256_max_epi16(_mm256_min_epi16(in17[k], max_val), min_val);
        in18[k] = _mm256_max_epi16(_mm256_min_epi16(in18[k], max_val), min_val);
        in19[k] = _mm256_max_epi16(_mm256_min_epi16(in19[k], max_val), min_val);
        in20[k] = _mm256_max_epi16(_mm256_min_epi16(in20[k], max_val), min_val);
        in21[k] = _mm256_max_epi16(_mm256_min_epi16(in21[k], max_val), min_val);
        in22[k] = _mm256_max_epi16(_mm256_min_epi16(in22[k], max_val), min_val);
        in23[k] = _mm256_max_epi16(_mm256_min_epi16(in23[k], max_val), min_val);
        in24[k] = _mm256_max_epi16(_mm256_min_epi16(in24[k], max_val), min_val);
        in25[k] = _mm256_max_epi16(_mm256_min_epi16(in25[k], max_val), min_val);
        in26[k] = _mm256_max_epi16(_mm256_min_epi16(in26[k], max_val), min_val);
        in27[k] = _mm256_max_epi16(_mm256_min_epi16(in27[k], max_val), min_val);
        in28[k] = _mm256_max_epi16(_mm256_min_epi16(in28[k], max_val), min_val);
        in29[k] = _mm256_max_epi16(_mm256_min_epi16(in29[k], max_val), min_val);
        in30[k] = _mm256_max_epi16(_mm256_min_epi16(in30[k], max_val), min_val);
        in31[k] = _mm256_max_epi16(_mm256_min_epi16(in31[k], max_val), min_val);
    }

    // Store
    for (i = 0; i < 2; i++) {
        const int offset = (i << 4);
        _mm256_storeu_si256((__m256i*)&dst[0 * 32 + offset], in00[i]);
        _mm256_storeu_si256((__m256i*)&dst[1 * 32 + offset], in01[i]);
        _mm256_storeu_si256((__m256i*)&dst[2 * 32 + offset], in02[i]);
        _mm256_storeu_si256((__m256i*)&dst[3 * 32 + offset], in03[i]);
        _mm256_storeu_si256((__m256i*)&dst[4 * 32 + offset], in04[i]);
        _mm256_storeu_si256((__m256i*)&dst[5 * 32 + offset], in05[i]);
        _mm256_storeu_si256((__m256i*)&dst[6 * 32 + offset], in06[i]);
        _mm256_storeu_si256((__m256i*)&dst[7 * 32 + offset], in07[i]);
        _mm256_storeu_si256((__m256i*)&dst[8 * 32 + offset], in08[i]);
        _mm256_storeu_si256((__m256i*)&dst[9 * 32 + offset], in09[i]);
        _mm256_storeu_si256((__m256i*)&dst[10 * 32 + offset], in10[i]);
        _mm256_storeu_si256((__m256i*)&dst[11 * 32 + offset], in11[i]);
        _mm256_storeu_si256((__m256i*)&dst[12 * 32 + offset], in12[i]);
        _mm256_storeu_si256((__m256i*)&dst[13 * 32 + offset], in13[i]);
        _mm256_storeu_si256((__m256i*)&dst[14 * 32 + offset], in14[i]);
_mm256_storeu_si256((__m256i*)&dst[15 * 32 + offset], in15[i]);
        _mm256_storeu_si256((__m256i*)&dst[16 * 32 + offset], in16[i]);
        _mm256_storeu_si256((__m256i*)&dst[17 * 32 + offset], in17[i]);
        _mm256_storeu_si256((__m256i*)&dst[18 * 32 + offset], in18[i]);
        _mm256_storeu_si256((__m256i*)&dst[19 * 32 + offset], in19[i]);
        _mm256_storeu_si256((__m256i*)&dst[20 * 32 + offset], in20[i]);
        _mm256_storeu_si256((__m256i*)&dst[21 * 32 + offset], in21[i]);
        _mm256_storeu_si256((__m256i*)&dst[22 * 32 + offset], in22[i]);
        _mm256_storeu_si256((__m256i*)&dst[23 * 32 + offset], in23[i]);
        _mm256_storeu_si256((__m256i*)&dst[24 * 32 + offset], in24[i]);
        _mm256_storeu_si256((__m256i*)&dst[25 * 32 + offset], in25[i]);
        _mm256_storeu_si256((__m256i*)&dst[26 * 32 + offset], in26[i]);
        _mm256_storeu_si256((__m256i*)&dst[27 * 32 + offset], in27[i]);
        _mm256_storeu_si256((__m256i*)&dst[28 * 32 + offset], in28[i]);
        _mm256_storeu_si256((__m256i*)&dst[29 * 32 + offset], in29[i]);
        _mm256_storeu_si256((__m256i*)&dst[30 * 32 + offset], in30[i]);
        _mm256_storeu_si256((__m256i*)&dst[31 * 32 + offset], in31[i]);
    }
}

#define TRANSPOSE_8x8_16BIT_m256i(I0, I1, I2, I3, I4, I5, I6, I7, O0, O1, O2, O3, O4, O5, O6, O7) \
    tr0_0 = _mm256_unpacklo_epi16(I0, I1); \
    tr0_1 = _mm256_unpacklo_epi16(I2, I3); \
    tr0_2 = _mm256_unpackhi_epi16(I0, I1); \
    tr0_3 = _mm256_unpackhi_epi16(I2, I3); \
    tr0_4 = _mm256_unpacklo_epi16(I4, I5); \
    tr0_5 = _mm256_unpacklo_epi16(I6, I7); \
    tr0_6 = _mm256_unpackhi_epi16(I4, I5); \
    tr0_7 = _mm256_unpackhi_epi16(I6, I7); \
    tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_1); \
    tr1_1 = _mm256_unpacklo_epi32(tr0_2, tr0_3); \
    tr1_2 = _mm256_unpackhi_epi32(tr0_0, tr0_1); \
    tr1_3 = _mm256_unpackhi_epi32(tr0_2, tr0_3); \
    tr1_4 = _mm256_unpacklo_epi32(tr0_4, tr0_5); \
    tr1_5 = _mm256_unpacklo_epi32(tr0_6, tr0_7); \
    tr1_6 = _mm256_unpackhi_epi32(tr0_4, tr0_5); \
    tr1_7 = _mm256_unpackhi_epi32(tr0_6, tr0_7); \
    O0 = _mm256_unpacklo_epi64(tr1_0, tr1_4); \
    O1 = _mm256_unpackhi_epi64(tr1_0, tr1_4); \
    O2 = _mm256_unpacklo_epi64(tr1_2, tr1_6); \
    O3 = _mm256_unpackhi_epi64(tr1_2, tr1_6); \
    O4 = _mm256_unpacklo_epi64(tr1_1, tr1_5); \
    O5 = _mm256_unpackhi_epi64(tr1_1, tr1_5); \
    O6 = _mm256_unpacklo_epi64(tr1_3, tr1_7); \
    O7 = _mm256_unpackhi_epi64(tr1_3, tr1_7);

#define TRANSPOSE_16x16_16BIT_m256i(I0, I1, I2, I3, I4, I5, I6, I7, I8, I9, I10, I11, I12, I13, I14, I15, O0, O1, O2, O3, O4, O5, O6, O7, O8, O9, O10, O11, O12, O13, O14, O15) \
    TRANSPOSE_8x8_16BIT_m256i(I0, I1, I2, I3, I4, I5, I6, I7, t0, t1, t2, t3, t4, t5, t6, t7); \
    TRANSPOSE_8x8_16BIT_m256i(I8, I9, I10, I11, I12, I13, I14, I15, t8, t9, t10, t11, t12, t13, t14, t15); \
    O0 = _mm256_permute2x128_si256(t0, t8, 0x20); \
    O1 = _mm256_permute2x128_si256(t1, t9, 0x20); \
    O2 = _mm256_permute2x128_si256(t2, t10, 0x20); \
    O3 = _mm256_permute2x128_si256(t3, t11, 0x20); \
    O4 = _mm256_permute2x128_si256(t4, t12, 0x20); \
    O5 = _mm256_permute2x128_si256(t5, t13, 0x20); \
    O6 = _mm256_permute2x128_si256(t6, t14, 0x20); \
    O7 = _mm256_permute2x128_si256(t7, t15, 0x20); \
    O8 = _mm256_permute2x128_si256(t0, t8, 0x31); \
    O9 = _mm256_permute2x128_si256(t1, t9, 0x31); \
    O10 = _mm256_permute2x128_si256(t2, t10, 0x31); \
    O11 = _mm256_permute2x128_si256(t3, t11, 0x31); \
    O12 = _mm256_permute2x128_si256(t4, t12, 0x31); \
    O13 = _mm256_permute2x128_si256(t5, t13, 0x31); \
    O14 = _mm256_permute2x128_si256(t6, t14, 0x31); \
    O15 = _mm256_permute2x128_si256(t7, t15, 0x31);

// inverse wavelet transform, 64x16 (AVX2 counterpart of inv_wavelet_64x16_sse128)
void inv_wavelet_64x16_avx2(coeff_t *coeff)
{
    int i;
    __m256i tr0_0, tr0_1, tr0_2, tr0_3, tr0_4, tr0_5, tr0_6, tr0_7;
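    /* A note on the inverse wavelet stage (summarizing the code below):
     * the decoded low-band block is loaded with a 1-bit right shift and
     * its samples become the even rows/columns of the output; each odd
     * row/column is interpolated as the rounded-down average of its two
     * even neighbors, and the last odd line simply repeats the last even
     * one (e.g. T15 = (T07 + T07) >> 1). The filter is applied vertically
     * first, the block is transposed with TRANSPOSE_16x16_16BIT_m256i so
     * the same filter can run horizontally, and a final transpose restores
     * the 64x16 layout before the store. */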
__m256i tr1_0, tr1_1, tr1_2, tr1_3, tr1_4, tr1_5, tr1_6, tr1_7; __m256i t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15; // 64*16 __m256i T00[4], T01[4], T02[4], T03[4], T04[4], T05[4], T06[4], T07[4], T08[4], T09[4], T10[4], T11[4], T12[4], T13[4], T14[4], T15[4]; // 16*64 __m256i V00, V01, V02, V03, V04, V05, V06, V07, V08, V09, V10, V11, V12, V13, V14, V15, V16, V17, V18, V19, V20, V21, V22, V23, V24, V25, V26, V27, V28, V29, V30, V31, V32, V33, V34, V35, V36, V37, V38, V39, V40, V41, V42, V43, V44, V45, V46, V47, V48, V49, V50, V51, V52, V53, V54, V55, V56, V57, V58, V59, V60, V61, V62, V63; /*--vertical transform--*/ //32*8, LOAD AND SHIFT T00[0] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[0 + 32 * 0]), 1); T01[0] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[0 + 32 * 1]), 1); T02[0] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[0 + 32 * 2]), 1); T03[0] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[0 + 32 * 3]), 1); T04[0] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[0 + 32 * 4]), 1); T05[0] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[0 + 32 * 5]), 1); T06[0] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[0 + 32 * 6]), 1); T07[0] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[0 + 32 * 7]), 1); T00[1] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 + 32 * 0]), 1); T01[1] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 + 32 * 1]), 1); T02[1] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 + 32 * 2]), 1); T03[1] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 + 32 * 3]), 1); T04[1] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 + 32 * 4]), 1); T05[1] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 + 32 * 5]), 1); T06[1] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 + 32 * 6]), 1); T07[1] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 + 32 * 7]), 1); //filter (odd pixel/row) T08[0] = _mm256_srai_epi16(_mm256_add_epi16(T00[0], T01[0]), 1); T09[0] = _mm256_srai_epi16(_mm256_add_epi16(T01[0], T02[0]), 1); T10[0] = _mm256_srai_epi16(_mm256_add_epi16(T02[0], T03[0]), 1); T11[0] = _mm256_srai_epi16(_mm256_add_epi16(T03[0], T04[0]), 1); T12[0] = _mm256_srai_epi16(_mm256_add_epi16(T04[0], T05[0]), 1); T13[0] = _mm256_srai_epi16(_mm256_add_epi16(T05[0], T06[0]), 1); T14[0] = _mm256_srai_epi16(_mm256_add_epi16(T06[0], T07[0]), 1); T15[0] = _mm256_srai_epi16(_mm256_add_epi16(T07[0], T07[0]), 1); T08[1] = _mm256_srai_epi16(_mm256_add_epi16(T00[1], T01[1]), 1); T09[1] = _mm256_srai_epi16(_mm256_add_epi16(T01[1], T02[1]), 1); T10[1] = _mm256_srai_epi16(_mm256_add_epi16(T02[1], T03[1]), 1); T11[1] = _mm256_srai_epi16(_mm256_add_epi16(T03[1], T04[1]), 1); T12[1] = _mm256_srai_epi16(_mm256_add_epi16(T04[1], T05[1]), 1); T13[1] = _mm256_srai_epi16(_mm256_add_epi16(T05[1], T06[1]), 1); T14[1] = _mm256_srai_epi16(_mm256_add_epi16(T06[1], T07[1]), 1); T15[1] = _mm256_srai_epi16(_mm256_add_epi16(T07[1], T07[1]), 1); /*--transposition--*/ //32x16 -> 16x32 TRANSPOSE_16x16_16BIT_m256i(T00[0], T08[0], T01[0], T09[0], T02[0], T10[0], T03[0], T11[0], T04[0], T12[0], T05[0], T13[0], T06[0], T14[0], T07[0], T15[0], V00, V01, V02, V03, V04, V05, V06, V07, V08, V09, V10, V11, V12, V13, V14, V15); TRANSPOSE_16x16_16BIT_m256i(T00[1], T08[1], T01[1], T09[1], T02[1], T10[1], T03[1], T11[1], T04[1], T12[1], T05[1], T13[1], T06[1], T14[1], T07[1], T15[1], V16, V17, V18, V19, V20, V21, V22, V23, V24, V25, V26, V27, V28, V29, V30, 
V31); /*--horizontal transform--*/ //filter (odd pixel/column) V32 = _mm256_srai_epi16(_mm256_add_epi16(V00, V01), 1); V33 = _mm256_srai_epi16(_mm256_add_epi16(V01, V02), 1); V34 = _mm256_srai_epi16(_mm256_add_epi16(V02, V03), 1); V35 = _mm256_srai_epi16(_mm256_add_epi16(V03, V04), 1); V36 = _mm256_srai_epi16(_mm256_add_epi16(V04, V05), 1); V37 = _mm256_srai_epi16(_mm256_add_epi16(V05, V06), 1); V38 = _mm256_srai_epi16(_mm256_add_epi16(V06, V07), 1); V39 = _mm256_srai_epi16(_mm256_add_epi16(V07, V08), 1); V40 = _mm256_srai_epi16(_mm256_add_epi16(V08, V09), 1); V41 = _mm256_srai_epi16(_mm256_add_epi16(V09, V10), 1); V42 = _mm256_srai_epi16(_mm256_add_epi16(V10, V11), 1); V43 = _mm256_srai_epi16(_mm256_add_epi16(V11, V12), 1); V44 = _mm256_srai_epi16(_mm256_add_epi16(V12, V13), 1); V45 = _mm256_srai_epi16(_mm256_add_epi16(V13, V14), 1); V46 = _mm256_srai_epi16(_mm256_add_epi16(V14, V15), 1); V47 = _mm256_srai_epi16(_mm256_add_epi16(V15, V16), 1); V48 = _mm256_srai_epi16(_mm256_add_epi16(V16, V17), 1); V49 = _mm256_srai_epi16(_mm256_add_epi16(V17, V18), 1); V50 = _mm256_srai_epi16(_mm256_add_epi16(V18, V19), 1); V51 = _mm256_srai_epi16(_mm256_add_epi16(V19, V20), 1); V52 = _mm256_srai_epi16(_mm256_add_epi16(V20, V21), 1); V53 = _mm256_srai_epi16(_mm256_add_epi16(V21, V22), 1); V54 = _mm256_srai_epi16(_mm256_add_epi16(V22, V23), 1); V55 = _mm256_srai_epi16(_mm256_add_epi16(V23, V24), 1); V56 = _mm256_srai_epi16(_mm256_add_epi16(V24, V25), 1); V57 = _mm256_srai_epi16(_mm256_add_epi16(V25, V26), 1); V58 = _mm256_srai_epi16(_mm256_add_epi16(V26, V27), 1); V59 = _mm256_srai_epi16(_mm256_add_epi16(V27, V28), 1); V60 = _mm256_srai_epi16(_mm256_add_epi16(V28, V29), 1); V61 = _mm256_srai_epi16(_mm256_add_epi16(V29, V30), 1); V62 = _mm256_srai_epi16(_mm256_add_epi16(V30, V31), 1); V63 = _mm256_srai_epi16(_mm256_add_epi16(V31, V31), 1); /*--transposition & Store--*/ //16x64 -> 64x16 TRANSPOSE_16x16_16BIT_m256i(V00, V32, V01, V33, V02, V34, V03, V35, V04, V36, V05, V37, V06, V38, V07, V39, T00[0], T01[0], T02[0], T03[0], T04[0], T05[0], T06[0], T07[0], T08[0], T09[0], T10[0], T11[0], T12[0], T13[0], T14[0], T15[0]); TRANSPOSE_16x16_16BIT_m256i(V08, V40, V09, V41, V10, V42, V11, V43, V12, V44, V13, V45, V14, V46, V15, V47, T00[1], T01[1], T02[1], T03[1], T04[1], T05[1], T06[1], T07[1], T08[1], T09[1], T10[1], T11[1], T12[1], T13[1], T14[1], T15[1]); TRANSPOSE_16x16_16BIT_m256i(V16, V48, V17, V49, V18, V50, V19, V51, V20, V52, V21, V53, V22, V54, V23, V55, T00[2], T01[2], T02[2], T03[2], T04[2], T05[2], T06[2], T07[2], T08[2], T09[2], T10[2], T11[2], T12[2], T13[2], T14[2], T15[2]); TRANSPOSE_16x16_16BIT_m256i(V24, V56, V25, V57, V26, V58, V27, V59, V28, V60, V29, V61, V30, V62, V31, V63, T00[3], T01[3], T02[3], T03[3], T04[3], T05[3], T06[3], T07[3], T08[3], T09[3], T10[3], T11[3], T12[3], T13[3], T14[3], T15[3]); //store for (i = 0; i < 4; i++) { _mm256_storeu_si256((__m256i*)&coeff[16 * i], T00[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64], T01[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 2], T02[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 3], T03[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 4], T04[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 5], T05[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 6], T06[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 7], T07[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 8], T08[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 9], T09[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i 
+ 64 * 10], T10[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 11], T11[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 12], T12[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 13], T13[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 14], T14[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 15], T15[i]); } } void inv_wavelet_16x64_avx2(coeff_t *coeff) { //src blk 8*32 __m256i tr0_0, tr0_1, tr0_2, tr0_3, tr0_4, tr0_5, tr0_6, tr0_7; __m256i tr1_0, tr1_1, tr1_2, tr1_3, tr1_4, tr1_5, tr1_6, tr1_7; __m256i t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15; __m256i S00, S01, S02, S03, S04, S05, S06, S07, S08, S09, S10, S11, S12, S13, S14, S15, S16, S17, S18, S19, S20, S21, S22, S23, S24, S25, S26, S27, S28, S29, S30, S31; __m256i S32, S33, S34, S35, S36, S37, S38, S39, S40, S41, S42, S43, S44, S45, S46, S47, S48, S49, S50, S51, S52, S53, S54, S55, S56, S57, S58, S59, S60, S61, S62, S63; // 64*16 __m256i TT00[8], TT01[8], TT02[8], TT03[8], TT04[8], TT05[8], TT06[8], TT07[8]; __m256i T00[4], T01[4], T02[4], T03[4], T04[4], T05[4], T06[4], T07[4], T08[4], T09[4], T10[4], T11[4], T12[4], T13[4], T14[4], T15[4]; // 16*64 __m256i V00, V01, V02, V03, V04, V05, V06, V07, V08, V09, V10, V11, V12, V13, V14, V15, V16, V17, V18, V19, V20, V21, V22, V23, V24, V25, V26, V27, V28, V29, V30, V31, V32, V33, V34, V35, V36, V37, V38, V39, V40, V41, V42, V43, V44, V45, V46, V47, V48, V49, V50, V51, V52, V53, V54, V55, V56, V57, V58, V59, V60, V61, V62, V63; int i; /*--load & shift--*/ //8*32 S00 = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[8 * 0]), 1); S01 = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[8 * 1]), 1); S02 = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[8 * 2]), 1); S03 = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[8 * 3]), 1); S04 = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[8 * 4]), 1); S05 = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[8 * 5]), 1); S06 = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[8 * 6]), 1); S07 = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[8 * 7]), 1); S08 = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[8 * 8]), 1); S09 = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[8 * 9]), 1); S10 = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[8 * 10]), 1); S11 = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[8 * 11]), 1); S12 = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[8 * 12]), 1); S13 = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[8 * 13]), 1); S14 = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[8 * 14]), 1); S15 = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[8 * 15]), 1); S16 = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[8 * 16]), 1); S17 = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[8 * 17]), 1); S18 = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[8 * 18]), 1); S19 = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[8 * 19]), 1); S20 = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[8 * 20]), 1); S21 = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[8 * 21]), 1); S22 = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[8 * 22]), 1); S23 = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[8 * 23]), 1); S24 = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[8 * 24]), 1); S25 = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[8 * 25]), 1); S26 = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[8 * 26]), 1); S27 = 
_mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[8 * 27]), 1); S28 = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[8 * 28]), 1); S29 = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[8 * 29]), 1); S30 = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[8 * 30]), 1); S31 = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[8 * 31]), 1); /*--vertical transform--*/ S32 = _mm256_srai_epi16(_mm256_add_epi16(S00, S01), 1); S33 = _mm256_srai_epi16(_mm256_add_epi16(S01, S02), 1); S34 = _mm256_srai_epi16(_mm256_add_epi16(S02, S03), 1); S35 = _mm256_srai_epi16(_mm256_add_epi16(S03, S04), 1); S36 = _mm256_srai_epi16(_mm256_add_epi16(S04, S05), 1); S37 = _mm256_srai_epi16(_mm256_add_epi16(S05, S06), 1); S38 = _mm256_srai_epi16(_mm256_add_epi16(S06, S07), 1); S39 = _mm256_srai_epi16(_mm256_add_epi16(S07, S08), 1); S40 = _mm256_srai_epi16(_mm256_add_epi16(S08, S09), 1); S41 = _mm256_srai_epi16(_mm256_add_epi16(S09, S10), 1); S42 = _mm256_srai_epi16(_mm256_add_epi16(S10, S11), 1); S43 = _mm256_srai_epi16(_mm256_add_epi16(S11, S12), 1); S44 = _mm256_srai_epi16(_mm256_add_epi16(S12, S13), 1); S45 = _mm256_srai_epi16(_mm256_add_epi16(S13, S14), 1); S46 = _mm256_srai_epi16(_mm256_add_epi16(S14, S15), 1); S47 = _mm256_srai_epi16(_mm256_add_epi16(S15, S16), 1); S48 = _mm256_srai_epi16(_mm256_add_epi16(S16, S17), 1); S49 = _mm256_srai_epi16(_mm256_add_epi16(S17, S18), 1); S50 = _mm256_srai_epi16(_mm256_add_epi16(S18, S19), 1); S51 = _mm256_srai_epi16(_mm256_add_epi16(S19, S20), 1); S52 = _mm256_srai_epi16(_mm256_add_epi16(S20, S21), 1); S53 = _mm256_srai_epi16(_mm256_add_epi16(S21, S22), 1); S54 = _mm256_srai_epi16(_mm256_add_epi16(S22, S23), 1); S55 = _mm256_srai_epi16(_mm256_add_epi16(S23, S24), 1); S56 = _mm256_srai_epi16(_mm256_add_epi16(S24, S25), 1); S57 = _mm256_srai_epi16(_mm256_add_epi16(S25, S26), 1); S58 = _mm256_srai_epi16(_mm256_add_epi16(S26, S27), 1); S59 = _mm256_srai_epi16(_mm256_add_epi16(S27, S28), 1); S60 = _mm256_srai_epi16(_mm256_add_epi16(S28, S29), 1); S61 = _mm256_srai_epi16(_mm256_add_epi16(S29, S30), 1); S62 = _mm256_srai_epi16(_mm256_add_epi16(S30, S31), 1); S63 = _mm256_srai_epi16(_mm256_add_epi16(S31, S31), 1); /*--transposition--*/ //8x64 -> 64x8 TRANSPOSE_8x8_16BIT_m256i(S00, S32, S01, S33, S02, S34, S03, S35, TT00[0], TT01[0], TT02[0], TT03[0], TT04[0], TT05[0], TT06[0], TT07[0]); TRANSPOSE_8x8_16BIT_m256i(S04, S36, S05, S37, S06, S38, S07, S39, TT00[1], TT01[1], TT02[1], TT03[1], TT04[1], TT05[1], TT06[1], TT07[1]); TRANSPOSE_8x8_16BIT_m256i(S08, S40, S09, S41, S10, S42, S11, S43, TT00[2], TT01[2], TT02[2], TT03[2], TT04[2], TT05[2], TT06[2], TT07[2]); TRANSPOSE_8x8_16BIT_m256i(S12, S44, S13, S45, S14, S46, S15, S47, TT00[3], TT01[3], TT02[3], TT03[3], TT04[3], TT05[3], TT06[3], TT07[3]); TRANSPOSE_8x8_16BIT_m256i(S16, S48, S17, S49, S18, S50, S19, S51, TT00[4], TT01[4], TT02[4], TT03[4], TT04[4], TT05[4], TT06[4], TT07[4]); TRANSPOSE_8x8_16BIT_m256i(S20, S52, S21, S53, S22, S54, S23, S55, TT00[5], TT01[5], TT02[5], TT03[5], TT04[5], TT05[5], TT06[5], TT07[5]); TRANSPOSE_8x8_16BIT_m256i(S24, S56, S25, S57, S26, S58, S27, S59, TT00[6], TT01[6], TT02[6], TT03[6], TT04[6], TT05[6], TT06[6], TT07[6]); TRANSPOSE_8x8_16BIT_m256i(S28, S60, S29, S61, S30, S62, S31, S63, TT00[7], TT01[7], TT02[7], TT03[7], TT04[7], TT05[7], TT06[7], TT07[7]); T00[0] = _mm256_permute2x128_si256(TT00[0], TT00[1], 0x20); T00[1] = _mm256_permute2x128_si256(TT00[2], TT00[3], 0x20); T00[2] = _mm256_permute2x128_si256(TT00[4], TT00[5], 0x20); T00[3] = 
_mm256_permute2x128_si256(TT00[6], TT00[7], 0x20); T01[0] = _mm256_permute2x128_si256(TT01[0], TT01[1], 0x20); T01[1] = _mm256_permute2x128_si256(TT01[2], TT01[3], 0x20); T01[2] = _mm256_permute2x128_si256(TT01[4], TT01[5], 0x20); T01[3] = _mm256_permute2x128_si256(TT01[6], TT01[7], 0x20); T02[0] = _mm256_permute2x128_si256(TT02[0], TT02[1], 0x20); T02[1] = _mm256_permute2x128_si256(TT02[2], TT02[3], 0x20); T02[2] = _mm256_permute2x128_si256(TT02[4], TT02[5], 0x20); T02[3] = _mm256_permute2x128_si256(TT02[6], TT02[7], 0x20); T03[0] = _mm256_permute2x128_si256(TT03[0], TT03[1], 0x20); T03[1] = _mm256_permute2x128_si256(TT03[2], TT03[3], 0x20); T03[2] = _mm256_permute2x128_si256(TT03[4], TT03[5], 0x20); T03[3] = _mm256_permute2x128_si256(TT03[6], TT03[7], 0x20); T04[0] = _mm256_permute2x128_si256(TT04[0], TT04[1], 0x20); T04[1] = _mm256_permute2x128_si256(TT04[2], TT04[3], 0x20); T04[2] = _mm256_permute2x128_si256(TT04[4], TT04[5], 0x20); T04[3] = _mm256_permute2x128_si256(TT04[6], TT04[7], 0x20); T05[0] = _mm256_permute2x128_si256(TT05[0], TT05[1], 0x20); T05[1] = _mm256_permute2x128_si256(TT05[2], TT05[3], 0x20); T05[2] = _mm256_permute2x128_si256(TT05[4], TT05[5], 0x20); T05[3] = _mm256_permute2x128_si256(TT05[6], TT05[7], 0x20); T06[0] = _mm256_permute2x128_si256(TT06[0], TT06[1], 0x20); T06[1] = _mm256_permute2x128_si256(TT06[2], TT06[3], 0x20); T06[2] = _mm256_permute2x128_si256(TT06[4], TT06[5], 0x20); T06[3] = _mm256_permute2x128_si256(TT06[6], TT06[7], 0x20); T07[0] = _mm256_permute2x128_si256(TT07[0], TT07[1], 0x20); T07[1] = _mm256_permute2x128_si256(TT07[2], TT07[3], 0x20); T07[2] = _mm256_permute2x128_si256(TT07[4], TT07[5], 0x20); T07[3] = _mm256_permute2x128_si256(TT07[6], TT07[7], 0x20); /*--horizontal transform--*/ for (i = 0; i < 4; i++) { T08[i] = _mm256_srai_epi16(_mm256_add_epi16(T00[i], T01[i]), 1); T09[i] = _mm256_srai_epi16(_mm256_add_epi16(T01[i], T02[i]), 1); T10[i] = _mm256_srai_epi16(_mm256_add_epi16(T02[i], T03[i]), 1); T11[i] = _mm256_srai_epi16(_mm256_add_epi16(T03[i], T04[i]), 1); T12[i] = _mm256_srai_epi16(_mm256_add_epi16(T04[i], T05[i]), 1); T13[i] = _mm256_srai_epi16(_mm256_add_epi16(T05[i], T06[i]), 1); T14[i] = _mm256_srai_epi16(_mm256_add_epi16(T06[i], T07[i]), 1); T15[i] = _mm256_srai_epi16(_mm256_add_epi16(T07[i], T07[i]), 1); } /*--transposition--*/ //64x16 -> 16x64 TRANSPOSE_16x16_16BIT_m256i(T00[0], T08[0], T01[0], T09[0], T02[0], T10[0], T03[0], T11[0], T04[0], T12[0], T05[0], T13[0], T06[0], T14[0], T07[0], T15[0], V00, V01, V02, V03, V04, V05, V06, V07, V08, V09, V10, V11, V12, V13, V14, V15); TRANSPOSE_16x16_16BIT_m256i(T00[1], T08[1], T01[1], T09[1], T02[1], T10[1], T03[1], T11[1], T04[1], T12[1], T05[1], T13[1], T06[1], T14[1], T07[1], T15[1], V16, V17, V18, V19, V20, V21, V22, V23, V24, V25, V26, V27, V28, V29, V30, V31); TRANSPOSE_16x16_16BIT_m256i(T00[2], T08[2], T01[2], T09[2], T02[2], T10[2], T03[2], T11[2], T04[2], T12[2], T05[2], T13[2], T06[2], T14[2], T07[2], T15[2], V32, V33, V34, V35, V36, V37, V38, V39, V40, V41, V42, V43, V44, V45, V46, V47); TRANSPOSE_16x16_16BIT_m256i(T00[3], T08[3], T01[3], T09[3], T02[3], T10[3], T03[3], T11[3], T04[3], T12[3], T05[3], T13[3], T06[3], T14[3], T07[3], T15[3], V48, V49, V50, V51, V52, V53, V54, V55, V56, V57, V58, V59, V60, V61, V62, V63); /*--Store--*/ //16x64 _mm256_storeu_si256((__m256i*)&coeff[16 * 0], V00); _mm256_storeu_si256((__m256i*)&coeff[16 * 1], V01); _mm256_storeu_si256((__m256i*)&coeff[16 * 2], V02); _mm256_storeu_si256((__m256i*)&coeff[16 * 3], V03); 
_mm256_storeu_si256((__m256i*)&coeff[16 * 4], V04); _mm256_storeu_si256((__m256i*)&coeff[16 * 5], V05); _mm256_storeu_si256((__m256i*)&coeff[16 * 6], V06); _mm256_storeu_si256((__m256i*)&coeff[16 * 7], V07); _mm256_storeu_si256((__m256i*)&coeff[16 * 8], V08); _mm256_storeu_si256((__m256i*)&coeff[16 * 9], V09); _mm256_storeu_si256((__m256i*)&coeff[16 * 10], V10); _mm256_storeu_si256((__m256i*)&coeff[16 * 11], V11); _mm256_storeu_si256((__m256i*)&coeff[16 * 12], V12); _mm256_storeu_si256((__m256i*)&coeff[16 * 13], V13); _mm256_storeu_si256((__m256i*)&coeff[16 * 14], V14); _mm256_storeu_si256((__m256i*)&coeff[16 * 15], V15); _mm256_storeu_si256((__m256i*)&coeff[16 * 16], V16); _mm256_storeu_si256((__m256i*)&coeff[16 * 17], V17); _mm256_storeu_si256((__m256i*)&coeff[16 * 18], V18); _mm256_storeu_si256((__m256i*)&coeff[16 * 19], V19); _mm256_storeu_si256((__m256i*)&coeff[16 * 20], V20); _mm256_storeu_si256((__m256i*)&coeff[16 * 21], V21); _mm256_storeu_si256((__m256i*)&coeff[16 * 22], V22); _mm256_storeu_si256((__m256i*)&coeff[16 * 23], V23); _mm256_storeu_si256((__m256i*)&coeff[16 * 24], V24); _mm256_storeu_si256((__m256i*)&coeff[16 * 25], V25); _mm256_storeu_si256((__m256i*)&coeff[16 * 26], V26); _mm256_storeu_si256((__m256i*)&coeff[16 * 27], V27); _mm256_storeu_si256((__m256i*)&coeff[16 * 28], V28); _mm256_storeu_si256((__m256i*)&coeff[16 * 29], V29); _mm256_storeu_si256((__m256i*)&coeff[16 * 30], V30); _mm256_storeu_si256((__m256i*)&coeff[16 * 31], V31); _mm256_storeu_si256((__m256i*)&coeff[16 * 32], V32); _mm256_storeu_si256((__m256i*)&coeff[16 * 33], V33); _mm256_storeu_si256((__m256i*)&coeff[16 * 34], V34); _mm256_storeu_si256((__m256i*)&coeff[16 * 35], V35); _mm256_storeu_si256((__m256i*)&coeff[16 * 36], V36); _mm256_storeu_si256((__m256i*)&coeff[16 * 37], V37); _mm256_storeu_si256((__m256i*)&coeff[16 * 38], V38); _mm256_storeu_si256((__m256i*)&coeff[16 * 39], V39); _mm256_storeu_si256((__m256i*)&coeff[16 * 40], V40); _mm256_storeu_si256((__m256i*)&coeff[16 * 41], V41); _mm256_storeu_si256((__m256i*)&coeff[16 * 42], V42); _mm256_storeu_si256((__m256i*)&coeff[16 * 43], V43); _mm256_storeu_si256((__m256i*)&coeff[16 * 44], V44); _mm256_storeu_si256((__m256i*)&coeff[16 * 45], V45); _mm256_storeu_si256((__m256i*)&coeff[16 * 46], V46); _mm256_storeu_si256((__m256i*)&coeff[16 * 47], V47); _mm256_storeu_si256((__m256i*)&coeff[16 * 48], V48); _mm256_storeu_si256((__m256i*)&coeff[16 * 49], V49); _mm256_storeu_si256((__m256i*)&coeff[16 * 50], V50); _mm256_storeu_si256((__m256i*)&coeff[16 * 51], V51); _mm256_storeu_si256((__m256i*)&coeff[16 * 52], V52); _mm256_storeu_si256((__m256i*)&coeff[16 * 53], V53); _mm256_storeu_si256((__m256i*)&coeff[16 * 54], V54); _mm256_storeu_si256((__m256i*)&coeff[16 * 55], V55); _mm256_storeu_si256((__m256i*)&coeff[16 * 56], V56); _mm256_storeu_si256((__m256i*)&coeff[16 * 57], V57); _mm256_storeu_si256((__m256i*)&coeff[16 * 58], V58); _mm256_storeu_si256((__m256i*)&coeff[16 * 59], V59); _mm256_storeu_si256((__m256i*)&coeff[16 * 60], V60); _mm256_storeu_si256((__m256i*)&coeff[16 * 61], V61); _mm256_storeu_si256((__m256i*)&coeff[16 * 62], V62); _mm256_storeu_si256((__m256i*)&coeff[16 * 63], V63); } void inv_wavelet_64x64_avx2(coeff_t *coeff) { int i; __m256i tr0_0, tr0_1, tr0_2, tr0_3, tr0_4, tr0_5, tr0_6, tr0_7; __m256i tr1_0, tr1_1, tr1_2, tr1_3, tr1_4, tr1_5, tr1_6, tr1_7; __m256i t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15; // 64*64 __m256i T00[4], T01[4], T02[4], T03[4], T04[4], T05[4], T06[4], T07[4], T08[4], T09[4], T10[4], T11[4], 
T12[4], T13[4], T14[4], T15[4], T16[4], T17[4], T18[4], T19[4], T20[4], T21[4], T22[4], T23[4], T24[4], T25[4], T26[4], T27[4], T28[4], T29[4], T30[4], T31[4], T32[4], T33[4], T34[4], T35[4], T36[4], T37[4], T38[4], T39[4], T40[4], T41[4], T42[4], T43[4], T44[4], T45[4], T46[4], T47[4], T48[4], T49[4], T50[4], T51[4], T52[4], T53[4], T54[4], T55[4], T56[4], T57[4], T58[4], T59[4], T60[4], T61[4], T62[4], T63[4]; // 64*64 __m256i V00[4], V01[4], V02[4], V03[4], V04[4], V05[4], V06[4], V07[4], V08[4], V09[4], V10[4], V11[4], V12[4], V13[4], V14[4], V15[4], V16[4], V17[4], V18[4], V19[4], V20[4], V21[4], V22[4], V23[4], V24[4], V25[4], V26[4], V27[4], V28[4], V29[4], V30[4], V31[4], V32[4], V33[4], V34[4], V35[4], V36[4], V37[4], V38[4], V39[4], V40[4], V41[4], V42[4], V43[4], V44[4], V45[4], V46[4], V47[4], V48[4], V49[4], V50[4], V51[4], V52[4], V53[4], V54[4], V55[4], V56[4], V57[4], V58[4], V59[4], V60[4], V61[4], V62[4], V63[4]; /*--vertical transform--*/ //32*32, LOAD AND SHIFT for (i = 0; i < 2; i++) { T00[i] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 * i + 32 * 0]), 1); T01[i] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 * i + 32 * 1]), 1); T02[i] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 * i + 32 * 2]), 1); T03[i] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 * i + 32 * 3]), 1); T04[i] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 * i + 32 * 4]), 1); T05[i] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 * i + 32 * 5]), 1); T06[i] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 * i + 32 * 6]), 1); T07[i] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 * i + 32 * 7]), 1); T08[i] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 * i + 32 * 8]), 1); T09[i] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 * i + 32 * 9]), 1); T10[i] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 * i + 32 * 10]), 1); T11[i] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 * i + 32 * 11]), 1); T12[i] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 * i + 32 * 12]), 1); T13[i] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 * i + 32 * 13]), 1); T14[i] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 * i + 32 * 14]), 1); T15[i] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 * i + 32 * 15]), 1); T16[i] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 * i + 32 * 16]), 1); T17[i] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 * i + 32 * 17]), 1); T18[i] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 * i + 32 * 18]), 1); T19[i] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 * i + 32 * 19]), 1); T20[i] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 * i + 32 * 20]), 1); T21[i] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 * i + 32 * 21]), 1); T22[i] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 * i + 32 * 22]), 1); T23[i] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 * i + 32 * 23]), 1); T24[i] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 * i + 32 * 24]), 1); T25[i] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 * i + 32 * 25]), 1); T26[i] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 * i + 32 * 26]), 1); T27[i] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 * i + 32 * 27]), 1); T28[i] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 * i + 32 * 28]), 1); T29[i] = 
_mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 * i + 32 * 29]), 1); T30[i] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 * i + 32 * 30]), 1); T31[i] = _mm256_srai_epi16(_mm256_loadu_si256((__m256i*)&coeff[16 * i + 32 * 31]), 1); } //filter (odd pixel/row) for (i = 0; i < 4; i++) { T32[i] = _mm256_srai_epi16(_mm256_add_epi16(T00[i], T01[i]), 1); T33[i] = _mm256_srai_epi16(_mm256_add_epi16(T01[i], T02[i]), 1); T34[i] = _mm256_srai_epi16(_mm256_add_epi16(T02[i], T03[i]), 1); T35[i] = _mm256_srai_epi16(_mm256_add_epi16(T03[i], T04[i]), 1); T36[i] = _mm256_srai_epi16(_mm256_add_epi16(T04[i], T05[i]), 1); T37[i] = _mm256_srai_epi16(_mm256_add_epi16(T05[i], T06[i]), 1); T38[i] = _mm256_srai_epi16(_mm256_add_epi16(T06[i], T07[i]), 1); T39[i] = _mm256_srai_epi16(_mm256_add_epi16(T07[i], T08[i]), 1); T40[i] = _mm256_srai_epi16(_mm256_add_epi16(T08[i], T09[i]), 1); T41[i] = _mm256_srai_epi16(_mm256_add_epi16(T09[i], T10[i]), 1); T42[i] = _mm256_srai_epi16(_mm256_add_epi16(T10[i], T11[i]), 1); T43[i] = _mm256_srai_epi16(_mm256_add_epi16(T11[i], T12[i]), 1); T44[i] = _mm256_srai_epi16(_mm256_add_epi16(T12[i], T13[i]), 1); T45[i] = _mm256_srai_epi16(_mm256_add_epi16(T13[i], T14[i]), 1); T46[i] = _mm256_srai_epi16(_mm256_add_epi16(T14[i], T15[i]), 1); T47[i] = _mm256_srai_epi16(_mm256_add_epi16(T15[i], T16[i]), 1); T48[i] = _mm256_srai_epi16(_mm256_add_epi16(T16[i], T17[i]), 1); T49[i] = _mm256_srai_epi16(_mm256_add_epi16(T17[i], T18[i]), 1); T50[i] = _mm256_srai_epi16(_mm256_add_epi16(T18[i], T19[i]), 1); T51[i] = _mm256_srai_epi16(_mm256_add_epi16(T19[i], T20[i]), 1); T52[i] = _mm256_srai_epi16(_mm256_add_epi16(T20[i], T21[i]), 1); T53[i] = _mm256_srai_epi16(_mm256_add_epi16(T21[i], T22[i]), 1); T54[i] = _mm256_srai_epi16(_mm256_add_epi16(T22[i], T23[i]), 1); T55[i] = _mm256_srai_epi16(_mm256_add_epi16(T23[i], T24[i]), 1); T56[i] = _mm256_srai_epi16(_mm256_add_epi16(T24[i], T25[i]), 1); T57[i] = _mm256_srai_epi16(_mm256_add_epi16(T25[i], T26[i]), 1); T58[i] = _mm256_srai_epi16(_mm256_add_epi16(T26[i], T27[i]), 1); T59[i] = _mm256_srai_epi16(_mm256_add_epi16(T27[i], T28[i]), 1); T60[i] = _mm256_srai_epi16(_mm256_add_epi16(T28[i], T29[i]), 1); T61[i] = _mm256_srai_epi16(_mm256_add_epi16(T29[i], T30[i]), 1); T62[i] = _mm256_srai_epi16(_mm256_add_epi16(T30[i], T31[i]), 1); T63[i] = _mm256_srai_epi16(_mm256_add_epi16(T31[i], T31[i]), 1); } /*--transposition--*/ //32x64 -> 64x32 TRANSPOSE_16x16_16BIT_m256i(T00[0], T32[0], T01[0], T33[0], T02[0], T34[0], T03[0], T35[0], T04[0], T36[0], T05[0], T37[0], T06[0], T38[0], T07[0], T39[0], V00[0], V01[0], V02[0], V03[0], V04[0], V05[0], V06[0], V07[0], V08[0], V09[0], V10[0], V11[0], V12[0], V13[0], V14[0], V15[0]); TRANSPOSE_16x16_16BIT_m256i(T08[0], T40[0], T09[0], T41[0], T10[0], T42[0], T11[0], T43[0], T12[0], T44[0], T13[0], T45[0], T14[0], T46[0], T15[0], T47[0], V00[1], V01[1], V02[1], V03[1], V04[1], V05[1], V06[1], V07[1], V08[1], V09[1], V10[1], V11[1], V12[1], V13[1], V14[1], V15[1]); TRANSPOSE_16x16_16BIT_m256i(T16[0], T48[0], T17[0], T49[0], T18[0], T50[0], T19[0], T51[0], T20[0], T52[0], T21[0], T53[0], T22[0], T54[0], T23[0], T55[0], V00[2], V01[2], V02[2], V03[2], V04[2], V05[2], V06[2], V07[2], V08[2], V09[2], V10[2], V11[2], V12[2], V13[2], V14[2], V15[2]); TRANSPOSE_16x16_16BIT_m256i(T24[0], T56[0], T25[0], T57[0], T26[0], T58[0], T27[0], T59[0], T28[0], T60[0], T29[0], T61[0], T30[0], T62[0], T31[0], T63[0], V00[3], V01[3], V02[3], V03[3], V04[3], V05[3], V06[3], V07[3], V08[3], V09[3], V10[3], V11[3], V12[3], V13[3], 
V14[3], V15[3]); TRANSPOSE_16x16_16BIT_m256i(T00[1], T32[1], T01[1], T33[1], T02[1], T34[1], T03[1], T35[1], T04[1], T36[1], T05[1], T37[1], T06[1], T38[1], T07[1], T39[1], V16[0], V17[0], V18[0], V19[0], V20[0], V21[0], V22[0], V23[0], V24[0], V25[0], V26[0], V27[0], V28[0], V29[0], V30[0], V31[0]); TRANSPOSE_16x16_16BIT_m256i(T08[1], T40[1], T09[1], T41[1], T10[1], T42[1], T11[1], T43[1], T12[1], T44[1], T13[1], T45[1], T14[1], T46[1], T15[1], T47[1], V16[1], V17[1], V18[1], V19[1], V20[1], V21[1], V22[1], V23[1], V24[1], V25[1], V26[1], V27[1], V28[1], V29[1], V30[1], V31[1]); TRANSPOSE_16x16_16BIT_m256i(T16[1], T48[1], T17[1], T49[1], T18[1], T50[1], T19[1], T51[1], T20[1], T52[1], T21[1], T53[1], T22[1], T54[1], T23[1], T55[1], V16[2], V17[2], V18[2], V19[2], V20[2], V21[2], V22[2], V23[2], V24[2], V25[2], V26[2], V27[2], V28[2], V29[2], V30[2], V31[2]); TRANSPOSE_16x16_16BIT_m256i(T24[1], T56[1], T25[1], T57[1], T26[1], T58[1], T27[1], T59[1], T28[1], T60[1], T29[1], T61[1], T30[1], T62[1], T31[1], T63[1], V16[3], V17[3], V18[3], V19[3], V20[3], V21[3], V22[3], V23[3], V24[3], V25[3], V26[3], V27[3], V28[3], V29[3], V30[3], V31[3]); /*--horizontal transform--*/ //filter (odd pixel/column) for (i = 0; i < 4; i++) { V32[i] = _mm256_srai_epi16(_mm256_add_epi16(V00[i], V01[i]), 1); V33[i] = _mm256_srai_epi16(_mm256_add_epi16(V01[i], V02[i]), 1); V34[i] = _mm256_srai_epi16(_mm256_add_epi16(V02[i], V03[i]), 1); V35[i] = _mm256_srai_epi16(_mm256_add_epi16(V03[i], V04[i]), 1); V36[i] = _mm256_srai_epi16(_mm256_add_epi16(V04[i], V05[i]), 1); V37[i] = _mm256_srai_epi16(_mm256_add_epi16(V05[i], V06[i]), 1); V38[i] = _mm256_srai_epi16(_mm256_add_epi16(V06[i], V07[i]), 1); V39[i] = _mm256_srai_epi16(_mm256_add_epi16(V07[i], V08[i]), 1); V40[i] = _mm256_srai_epi16(_mm256_add_epi16(V08[i], V09[i]), 1); V41[i] = _mm256_srai_epi16(_mm256_add_epi16(V09[i], V10[i]), 1); V42[i] = _mm256_srai_epi16(_mm256_add_epi16(V10[i], V11[i]), 1); V43[i] = _mm256_srai_epi16(_mm256_add_epi16(V11[i], V12[i]), 1); V44[i] = _mm256_srai_epi16(_mm256_add_epi16(V12[i], V13[i]), 1); V45[i] = _mm256_srai_epi16(_mm256_add_epi16(V13[i], V14[i]), 1); V46[i] = _mm256_srai_epi16(_mm256_add_epi16(V14[i], V15[i]), 1); V47[i] = _mm256_srai_epi16(_mm256_add_epi16(V15[i], V16[i]), 1); V48[i] = _mm256_srai_epi16(_mm256_add_epi16(V16[i], V17[i]), 1); V49[i] = _mm256_srai_epi16(_mm256_add_epi16(V17[i], V18[i]), 1); V50[i] = _mm256_srai_epi16(_mm256_add_epi16(V18[i], V19[i]), 1); V51[i] = _mm256_srai_epi16(_mm256_add_epi16(V19[i], V20[i]), 1); V52[i] = _mm256_srai_epi16(_mm256_add_epi16(V20[i], V21[i]), 1); V53[i] = _mm256_srai_epi16(_mm256_add_epi16(V21[i], V22[i]), 1); V54[i] = _mm256_srai_epi16(_mm256_add_epi16(V22[i], V23[i]), 1); V55[i] = _mm256_srai_epi16(_mm256_add_epi16(V23[i], V24[i]), 1); V56[i] = _mm256_srai_epi16(_mm256_add_epi16(V24[i], V25[i]), 1); V57[i] = _mm256_srai_epi16(_mm256_add_epi16(V25[i], V26[i]), 1); V58[i] = _mm256_srai_epi16(_mm256_add_epi16(V26[i], V27[i]), 1); V59[i] = _mm256_srai_epi16(_mm256_add_epi16(V27[i], V28[i]), 1); V60[i] = _mm256_srai_epi16(_mm256_add_epi16(V28[i], V29[i]), 1); V61[i] = _mm256_srai_epi16(_mm256_add_epi16(V29[i], V30[i]), 1); V62[i] = _mm256_srai_epi16(_mm256_add_epi16(V30[i], V31[i]), 1); V63[i] = _mm256_srai_epi16(_mm256_add_epi16(V31[i], V31[i]), 1); } /*--transposition & Store--*/ //64x64 TRANSPOSE_16x16_16BIT_m256i(V00[0], V32[0], V01[0], V33[0], V02[0], V34[0], V03[0], V35[0], V04[0], V36[0], V05[0], V37[0], V06[0], V38[0], V07[0], V39[0], T00[0], T01[0], T02[0], T03[0], 
T04[0], T05[0], T06[0], T07[0], T08[0], T09[0], T10[0], T11[0], T12[0], T13[0], T14[0], T15[0]); TRANSPOSE_16x16_16BIT_m256i(V00[1], V32[1], V01[1], V33[1], V02[1], V34[1], V03[1], V35[1], V04[1], V36[1], V05[1], V37[1], V06[1], V38[1], V07[1], V39[1], T16[0], T17[0], T18[0], T19[0], T20[0], T21[0], T22[0], T23[0], T24[0], T25[0], T26[0], T27[0], T28[0], T29[0], T30[0], T31[0]); TRANSPOSE_16x16_16BIT_m256i(V00[2], V32[2], V01[2], V33[2], V02[2], V34[2], V03[2], V35[2], V04[2], V36[2], V05[2], V37[2], V06[2], V38[2], V07[2], V39[2], T32[0], T33[0], T34[0], T35[0], T36[0], T37[0], T38[0], T39[0], T40[0], T41[0], T42[0], T43[0], T44[0], T45[0], T46[0], T47[0]); TRANSPOSE_16x16_16BIT_m256i(V00[3], V32[3], V01[3], V33[3], V02[3], V34[3], V03[3], V35[3], V04[3], V36[3], V05[3], V37[3], V06[3], V38[3], V07[3], V39[3], T48[0], T49[0], T50[0], T51[0], T52[0], T53[0], T54[0], T55[0], T56[0], T57[0], T58[0], T59[0], T60[0], T61[0], T62[0], T63[0]); TRANSPOSE_16x16_16BIT_m256i(V08[0], V40[0], V09[0], V41[0], V10[0], V42[0], V11[0], V43[0], V12[0], V44[0], V13[0], V45[0], V14[0], V46[0], V15[0], V47[0], T00[1], T01[1], T02[1], T03[1], T04[1], T05[1], T06[1], T07[1], T08[1], T09[1], T10[1], T11[1], T12[1], T13[1], T14[1], T15[1]); TRANSPOSE_16x16_16BIT_m256i(V08[1], V40[1], V09[1], V41[1], V10[1], V42[1], V11[1], V43[1], V12[1], V44[1], V13[1], V45[1], V14[1], V46[1], V15[1], V47[1], T16[1], T17[1], T18[1], T19[1], T20[1], T21[1], T22[1], T23[1], T24[1], T25[1], T26[1], T27[1], T28[1], T29[1], T30[1], T31[1]); TRANSPOSE_16x16_16BIT_m256i(V08[2], V40[2], V09[2], V41[2], V10[2], V42[2], V11[2], V43[2], V12[2], V44[2], V13[2], V45[2], V14[2], V46[2], V15[2], V47[2], T32[1], T33[1], T34[1], T35[1], T36[1], T37[1], T38[1], T39[1], T40[1], T41[1], T42[1], T43[1], T44[1], T45[1], T46[1], T47[1]); TRANSPOSE_16x16_16BIT_m256i(V08[3], V40[3], V09[3], V41[3], V10[3], V42[3], V11[3], V43[3], V12[3], V44[3], V13[3], V45[3], V14[3], V46[3], V15[3], V47[3], T48[1], T49[1], T50[1], T51[1], T52[1], T53[1], T54[1], T55[1], T56[1], T57[1], T58[1], T59[1], T60[1], T61[1], T62[1], T63[1]); TRANSPOSE_16x16_16BIT_m256i(V16[0], V48[0], V17[0], V49[0], V18[0], V50[0], V19[0], V51[0], V20[0], V52[0], V21[0], V53[0], V22[0], V54[0], V23[0], V55[0], T00[2], T01[2], T02[2], T03[2], T04[2], T05[2], T06[2], T07[2], T08[2], T09[2], T10[2], T11[2], T12[2], T13[2], T14[2], T15[2]); TRANSPOSE_16x16_16BIT_m256i(V16[1], V48[1], V17[1], V49[1], V18[1], V50[1], V19[1], V51[1], V20[1], V52[1], V21[1], V53[1], V22[1], V54[1], V23[1], V55[1], T16[2], T17[2], T18[2], T19[2], T20[2], T21[2], T22[2], T23[2], T24[2], T25[2], T26[2], T27[2], T28[2], T29[2], T30[2], T31[2]); TRANSPOSE_16x16_16BIT_m256i(V16[2], V48[2], V17[2], V49[2], V18[2], V50[2], V19[2], V51[2], V20[2], V52[2], V21[2], V53[2], V22[2], V54[2], V23[2], V55[2], T32[2], T33[2], T34[2], T35[2], T36[2], T37[2], T38[2], T39[2], T40[2], T41[2], T42[2], T43[2], T44[2], T45[2], T46[2], T47[2]); TRANSPOSE_16x16_16BIT_m256i(V16[3], V48[3], V17[3], V49[3], V18[3], V50[3], V19[3], V51[3], V20[3], V52[3], V21[3], V53[3], V22[3], V54[3], V23[3], V55[3], T48[2], T49[2], T50[2], T51[2], T52[2], T53[2], T54[2], T55[2], T56[2], T57[2], T58[2], T59[2], T60[2], T61[2], T62[2], T63[2]); TRANSPOSE_16x16_16BIT_m256i(V24[0], V56[0], V25[0], V57[0], V26[0], V58[0], V27[0], V59[0], V28[0], V60[0], V29[0], V61[0], V30[0], V62[0], V31[0], V63[0], T00[3], T01[3], T02[3], T03[3], T04[3], T05[3], T06[3], T07[3], T08[3], T09[3], T10[3], T11[3], T12[3], T13[3], T14[3], T15[3]); TRANSPOSE_16x16_16BIT_m256i(V24[1], 
V56[1], V25[1], V57[1], V26[1], V58[1], V27[1], V59[1], V28[1], V60[1], V29[1], V61[1], V30[1], V62[1], V31[1], V63[1], T16[3], T17[3], T18[3], T19[3], T20[3], T21[3], T22[3], T23[3], T24[3], T25[3], T26[3], T27[3], T28[3], T29[3], T30[3], T31[3]); TRANSPOSE_16x16_16BIT_m256i(V24[2], V56[2], V25[2], V57[2], V26[2], V58[2], V27[2], V59[2], V28[2], V60[2], V29[2], V61[2], V30[2], V62[2], V31[2], V63[2], T32[3], T33[3], T34[3], T35[3], T36[3], T37[3], T38[3], T39[3], T40[3], T41[3], T42[3], T43[3], T44[3], T45[3], T46[3], T47[3]); TRANSPOSE_16x16_16BIT_m256i(V24[3], V56[3], V25[3], V57[3], V26[3], V58[3], V27[3], V59[3], V28[3], V60[3], V29[3], V61[3], V30[3], V62[3], V31[3], V63[3], T48[3], T49[3], T50[3], T51[3], T52[3], T53[3], T54[3], T55[3], T56[3], T57[3], T58[3], T59[3], T60[3], T61[3], T62[3], T63[3]); //store for (i = 0; i < 4; i++) { _mm256_storeu_si256((__m256i*)&coeff[16 * i], T00[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64], T01[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 2], T02[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 3], T03[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 4], T04[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 5], T05[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 6], T06[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 7], T07[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 8], T08[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 9], T09[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 10], T10[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 11], T11[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 12], T12[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 13], T13[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 14], T14[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 15], T15[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 16], T16[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 17], T17[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 18], T18[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 19], T19[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 20], T20[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 21], T21[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 22], T22[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 23], T23[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 24], T24[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 25], T25[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 26], T26[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 27], T27[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 28], T28[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 29], T29[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 30], T30[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 31], T31[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 32], T32[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 33], T33[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 34], T34[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 35], T35[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 36], T36[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 37], T37[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 38], T38[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 39], T39[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 40], T40[i]); _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 41], 
T41[i]);
        _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 42], T42[i]);
        _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 43], T43[i]);
        _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 44], T44[i]);
        _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 45], T45[i]);
        _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 46], T46[i]);
        _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 47], T47[i]);
        _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 48], T48[i]);
        _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 49], T49[i]);
        _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 50], T50[i]);
        _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 51], T51[i]);
        _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 52], T52[i]);
        _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 53], T53[i]);
        _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 54], T54[i]);
        _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 55], T55[i]);
        _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 56], T56[i]);
        _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 57], T57[i]);
        _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 58], T58[i]);
        _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 59], T59[i]);
        _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 60], T60[i]);
        _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 61], T61[i]);
        _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 62], T62[i]);
        _mm256_storeu_si256((__m256i*)&coeff[16 * i + 64 * 63], T63[i]);
    }
}

/* --------------------------------------------------------------------------- */
void idct_64x64_avx2(const coeff_t *src, coeff_t *dst, int i_dst)
{
    UNUSED_PARAMETER(i_dst);
    idct_32x32_avx2(src, dst, 32 | 0x01);   // TODO: change the code to avx2
    inv_wavelet_64x64_avx2(dst);
}

/* --------------------------------------------------------------------------- */
void idct_64x16_avx2(const coeff_t *src, coeff_t *dst, int i_dst)
{
    UNUSED_PARAMETER(i_dst);
    idct_32x8_sse128(src, dst, 32 | 0x01);  // TODO: change the code to avx2
    inv_wavelet_64x16_avx2(dst);
}

/* --------------------------------------------------------------------------- */
void idct_16x64_avx2(const coeff_t *src, coeff_t *dst, int i_dst)
{
    UNUSED_PARAMETER(i_dst);
    idct_8x32_sse128(src, dst, 8 | 0x01);   // TODO: change the code to avx2
    inv_wavelet_16x64_avx2(dst);
}
davs2-1.6/source/common/vec/intrinsic_inter_pred.cc000066400000000000000000006555241337322544400225140ustar00rootroot00000000000000/*
 * intrinsic_inter_pred.cc
 *
 * Description of this file:
 *    SSE intrinsic functions of the Inter-Prediction module of the davs2 library
 *
 * --------------------------------------------------------------------------
 *
 * davs2 - video decoder of AVS2/IEEE1857.4 video coding standard
 * Copyright (C) 2018~ VCL, NELVT, Peking University
 *
 * Authors: Falei LUO
 *          etc.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at sswang @ pku.edu.cn.
 */

#include "../common.h"
#include "intrinsic.h"

#include <mmintrin.h>
#include <emmintrin.h>
#include <tmmintrin.h>
#include <smmintrin.h>
#include <immintrin.h>

#if !HIGH_BIT_DEPTH

/* --------------------------------------------------------------------------- */
void intpl_chroma_block_hor_sse128(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff)
{
    const int16_t offset = 32;
    const int shift = 6;
    int row, col;
    const __m128i mAddOffset = _mm_set1_epi16(offset);
    const __m128i mSwitch1 = _mm_setr_epi8(0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6);
    const __m128i mSwitch2 = _mm_setr_epi8(4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10);
    const __m128i mCoef = _mm_set1_epi32(*(int*)coeff);
    const __m128i mask = _mm_loadu_si128((__m128i*)(intrinsic_mask[(width & 7) - 1]));

    src -= 1;
    for (row = 0; row < height; row++) {
        __m128i mSrc, mT20, mT40, mVal;

        for (col = 0; col < width - 7; col += 8) {
            mSrc = _mm_loadu_si128((__m128i*)(src + col));
            mT20 = _mm_maddubs_epi16(_mm_shuffle_epi8(mSrc, mSwitch1), mCoef);
            mT40 = _mm_maddubs_epi16(_mm_shuffle_epi8(mSrc, mSwitch2), mCoef);

            mVal = _mm_hadd_epi16(mT20, mT40);
            mVal = _mm_srai_epi16(_mm_add_epi16(mVal, mAddOffset), shift);
            mVal = _mm_packus_epi16(mVal, mVal);
            _mm_storel_epi64((__m128i*)&dst[col], mVal);
        }

        if (col < width) {
            mSrc = _mm_loadu_si128((__m128i*)(src + col));
            mT20 = _mm_maddubs_epi16(_mm_shuffle_epi8(mSrc, mSwitch1), mCoef);
            mT40 = _mm_maddubs_epi16(_mm_shuffle_epi8(mSrc, mSwitch2), mCoef);

            mVal = _mm_hadd_epi16(mT20, mT40);
            mVal = _mm_srai_epi16(_mm_add_epi16(mVal, mAddOffset), shift);
            mVal = _mm_packus_epi16(mVal, mVal);
            _mm_maskmoveu_si128(mVal, mask, (char *)&dst[col]);
        }

        src += i_src;
        dst += i_dst;
    }
}

/* --------------------------------------------------------------------------- */
void intpl_luma_block_hor_sse128(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff)
{
    int row, col = 0;
    const short offset = 32;
    const int shift = 6;

    __m128i mAddOffset = _mm_set1_epi16(offset);
    __m128i mSwitch1 = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8);
    __m128i mSwitch2 = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9, 10);
    __m128i mSwitch3 = _mm_setr_epi8(4, 5, 6, 7, 8, 9, 10, 11, 5, 6, 7, 8, 9, 10, 11, 12);
    __m128i mSwitch4 = _mm_setr_epi8(6, 7, 8, 9, 10, 11, 12, 13, 7, 8, 9, 10, 11, 12, 13, 14);
    __m128i mCoef = _mm_loadl_epi64((__m128i*)coeff);
    __m128i mask = _mm_loadu_si128((__m128i*)(intrinsic_mask[(width & 7) - 1]));

    mCoef = _mm_unpacklo_epi64(mCoef, mCoef);

    src -= 3;
    for (row = 0; row < height; row++) {
        __m128i srcCoeff, T20, T40, T60, T80, sum;

        for (col = 0; col < width - 7; col += 8) {
            srcCoeff = _mm_loadu_si128((__m128i*)(src + col));

            T20 = _mm_maddubs_epi16(_mm_shuffle_epi8(srcCoeff, mSwitch1), mCoef);
            T40 = _mm_maddubs_epi16(_mm_shuffle_epi8(srcCoeff, mSwitch2), mCoef);
            T60 = _mm_maddubs_epi16(_mm_shuffle_epi8(srcCoeff, mSwitch3), mCoef);
            T80 = _mm_maddubs_epi16(_mm_shuffle_epi8(srcCoeff, mSwitch4), mCoef);

            sum = _mm_hadd_epi16(_mm_hadd_epi16(T20, T40), _mm_hadd_epi16(T60, T80));
            sum = _mm_srai_epi16(_mm_add_epi16(sum, mAddOffset), shift);
            sum = _mm_packus_epi16(sum, sum);
            _mm_storel_epi64((__m128i*)&dst[col], sum);
        }

        if (col < width) {
            srcCoeff = _mm_loadu_si128((__m128i*)(src + col));

            T20 = _mm_maddubs_epi16(_mm_shuffle_epi8(srcCoeff, mSwitch1), mCoef);
            T40 = _mm_maddubs_epi16(_mm_shuffle_epi8(srcCoeff, mSwitch2), mCoef);
            T60 = _mm_maddubs_epi16(_mm_shuffle_epi8(srcCoeff, mSwitch3), mCoef);
            T80 = _mm_maddubs_epi16(_mm_shuffle_epi8(srcCoeff,
mSwitch4), mCoef); sum = _mm_hadd_epi16(_mm_hadd_epi16(T20, T40), _mm_hadd_epi16(T60, T80)); sum = _mm_srai_epi16(_mm_add_epi16(sum, mAddOffset), shift); sum = _mm_packus_epi16(sum, sum); _mm_maskmoveu_si128(sum, mask, (char *)&dst[col]); } src += i_src; dst += i_dst; } } /* --------------------------------------------------------------------------- */ void intpl_luma_hor_sse128(pel_t *dst, int i_dst, mct_t *tmp, int i_tmp, pel_t *src, int i_src, int width, int height, int8_t const *coeff) { int row, col = 0; const short offset = 32; const int shift = 6; __m128i mAddOffset = _mm_set1_epi16(offset); __m128i mSwitch1 = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8); __m128i mSwitch2 = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9, 10); __m128i mSwitch3 = _mm_setr_epi8(4, 5, 6, 7, 8, 9, 10, 11, 5, 6, 7, 8, 9, 10, 11, 12); __m128i mSwitch4 = _mm_setr_epi8(6, 7, 8, 9, 10, 11, 12, 13, 7, 8, 9, 10, 11, 12, 13, 14); __m128i mCoef = _mm_loadl_epi64((__m128i*)coeff); mCoef = _mm_unpacklo_epi64(mCoef, mCoef); __m128i T01, T23, T45, T67, T89, Tab, Tcd, Tef; __m128i S1, S2, S3, S4; __m128i U0, U1; __m128i Val1, Val2, Val; src -= 3; for (row = 0; row < height; row++) { for (col = 0; col < width - 8; col += 16) { __m128i srcCoeff1 = _mm_loadu_si128((__m128i*)(src + col)); __m128i srcCoeff2 = _mm_loadu_si128((__m128i*)(src + col + 8)); T01 = _mm_maddubs_epi16(_mm_shuffle_epi8(srcCoeff1, mSwitch1), mCoef); T23 = _mm_maddubs_epi16(_mm_shuffle_epi8(srcCoeff1, mSwitch2), mCoef); T45 = _mm_maddubs_epi16(_mm_shuffle_epi8(srcCoeff1, mSwitch3), mCoef); T67 = _mm_maddubs_epi16(_mm_shuffle_epi8(srcCoeff1, mSwitch4), mCoef); S1 = _mm_hadd_epi16(T01, T23); S2 = _mm_hadd_epi16(T45, T67); U0 = _mm_hadd_epi16(S1, S2); _mm_store_si128((__m128i*)&tmp[col], U0); T89 = _mm_maddubs_epi16(_mm_shuffle_epi8(srcCoeff2, mSwitch1), mCoef); Tab = _mm_maddubs_epi16(_mm_shuffle_epi8(srcCoeff2, mSwitch2), mCoef); Tcd = _mm_maddubs_epi16(_mm_shuffle_epi8(srcCoeff2, mSwitch3), mCoef); Tef = _mm_maddubs_epi16(_mm_shuffle_epi8(srcCoeff2, mSwitch4), mCoef); S3 = _mm_hadd_epi16(T89, Tab); S4 = _mm_hadd_epi16(Tcd, Tef); U1 = _mm_hadd_epi16(S3, S4); _mm_store_si128((__m128i*)&tmp[col + 8], U1); Val1 = _mm_add_epi16(U0, mAddOffset); Val2 = _mm_add_epi16(U1, mAddOffset); Val1 = _mm_srai_epi16(Val1, shift); Val2 = _mm_srai_epi16(Val2, shift); Val = _mm_packus_epi16(Val1, Val2); _mm_storeu_si128((__m128i*)&dst[col], Val); } if (col < width) { __m128i srcCoeff1 = _mm_loadu_si128((__m128i*)(src + col)); T01 = _mm_maddubs_epi16(_mm_shuffle_epi8(srcCoeff1, mSwitch1), mCoef); T23 = _mm_maddubs_epi16(_mm_shuffle_epi8(srcCoeff1, mSwitch2), mCoef); T45 = _mm_maddubs_epi16(_mm_shuffle_epi8(srcCoeff1, mSwitch3), mCoef); T67 = _mm_maddubs_epi16(_mm_shuffle_epi8(srcCoeff1, mSwitch4), mCoef); S1 = _mm_hadd_epi16(T01, T23); S2 = _mm_hadd_epi16(T45, T67); U0 = _mm_hadd_epi16(S1, S2); _mm_store_si128((__m128i*)&tmp[col], U0); Val1 = _mm_add_epi16(U0, mAddOffset); Val1 = _mm_srai_epi16(Val1, shift); Val = _mm_packus_epi16(Val1, Val1); _mm_store_si128((__m128i*)&dst[col], Val); } src += i_src; tmp += i_tmp; dst += i_dst; } } /* --------------------------------------------------------------------------- */ void intpl_luma_hor_x3_sse128(pel_t *const dst[3], int i_dst, mct_t *const tmp[3], int i_tmp, pel_t *src, int i_src, int width, int height, const int8_t **coeff) { int row, col = 0; const short offset = 32; const int shift = 6; __m128i mAddOffset = _mm_set1_epi16(offset); __m128i mSwitch1 = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 
4, 5, 6, 7, 8); __m128i mSwitch2 = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9, 10); __m128i mSwitch3 = _mm_setr_epi8(4, 5, 6, 7, 8, 9, 10, 11, 5, 6, 7, 8, 9, 10, 11, 12); __m128i mSwitch4 = _mm_setr_epi8(6, 7, 8, 9, 10, 11, 12, 13, 7, 8, 9, 10, 11, 12, 13, 14); __m128i mCoef0 = _mm_loadl_epi64((__m128i*)coeff[0]); __m128i mCoef1 = _mm_loadl_epi64((__m128i*)coeff[1]); __m128i mCoef2 = _mm_loadl_epi64((__m128i*)coeff[2]); mct_t *tmp0 = tmp[0]; mct_t *tmp1 = tmp[1]; mct_t *tmp2 = tmp[2]; pel_t *dst0 = dst[0]; pel_t *dst1 = dst[1]; pel_t *dst2 = dst[2]; __m128i mask = _mm_loadu_si128((__m128i*)(intrinsic_mask[(width & 7) - 1])); mCoef0 = _mm_unpacklo_epi64(mCoef0, mCoef0); mCoef1 = _mm_unpacklo_epi64(mCoef1, mCoef1); mCoef2 = _mm_unpacklo_epi64(mCoef2, mCoef2); src -= 3; for (row = 0; row < height; row++) { __m128i TC1, TC2, TC3, TC4; __m128i T20, T40, T60, T80, sum, val; __m128i srcCoeff; for (col = 0; col < width - 7; col += 8) { srcCoeff = _mm_loadu_si128((__m128i*)(src + col)); TC1 = _mm_shuffle_epi8(srcCoeff, mSwitch1); TC2 = _mm_shuffle_epi8(srcCoeff, mSwitch2); TC3 = _mm_shuffle_epi8(srcCoeff, mSwitch3); TC4 = _mm_shuffle_epi8(srcCoeff, mSwitch4); // First T20 = _mm_maddubs_epi16(TC1, mCoef0); T40 = _mm_maddubs_epi16(TC2, mCoef0); T60 = _mm_maddubs_epi16(TC3, mCoef0); T80 = _mm_maddubs_epi16(TC4, mCoef0); sum = _mm_hadd_epi16(_mm_hadd_epi16(T20, T40), _mm_hadd_epi16(T60, T80)); _mm_store_si128((__m128i*)(&tmp0[col]), sum); val = _mm_srai_epi16(_mm_add_epi16(sum, mAddOffset), shift); val = _mm_packus_epi16(val, val); _mm_storel_epi64((__m128i*)&dst0[col], val); // Second T20 = _mm_maddubs_epi16(TC1, mCoef1); T40 = _mm_maddubs_epi16(TC2, mCoef1); T60 = _mm_maddubs_epi16(TC3, mCoef1); T80 = _mm_maddubs_epi16(TC4, mCoef1); sum = _mm_hadd_epi16(_mm_hadd_epi16(T20, T40), _mm_hadd_epi16(T60, T80)); _mm_store_si128((__m128i*)(&tmp1[col]), sum); val = _mm_srai_epi16(_mm_add_epi16(sum, mAddOffset), shift); val = _mm_packus_epi16(val, val); _mm_storel_epi64((__m128i*)&dst1[col], val); // Third T20 = _mm_maddubs_epi16(TC1, mCoef2); T40 = _mm_maddubs_epi16(TC2, mCoef2); T60 = _mm_maddubs_epi16(TC3, mCoef2); T80 = _mm_maddubs_epi16(TC4, mCoef2); sum = _mm_hadd_epi16(_mm_hadd_epi16(T20, T40), _mm_hadd_epi16(T60, T80)); _mm_store_si128((__m128i*)(&tmp2[col]), sum); val = _mm_srai_epi16(_mm_add_epi16(sum, mAddOffset), shift); val = _mm_packus_epi16(val, val); _mm_storel_epi64((__m128i*)&dst2[col], val); } if (col < width) { srcCoeff = _mm_loadu_si128((__m128i*)(src + col)); TC1 = _mm_shuffle_epi8(srcCoeff, mSwitch1); TC2 = _mm_shuffle_epi8(srcCoeff, mSwitch2); TC3 = _mm_shuffle_epi8(srcCoeff, mSwitch3); TC4 = _mm_shuffle_epi8(srcCoeff, mSwitch4); // First T20 = _mm_maddubs_epi16(TC1, mCoef0); T40 = _mm_maddubs_epi16(TC2, mCoef0); T60 = _mm_maddubs_epi16(TC3, mCoef0); T80 = _mm_maddubs_epi16(TC4, mCoef0); sum = _mm_hadd_epi16(_mm_hadd_epi16(T20, T40), _mm_hadd_epi16(T60, T80)); _mm_store_si128((__m128i*)(&tmp0[col]), sum); val = _mm_srai_epi16(_mm_add_epi16(sum, mAddOffset), shift); val = _mm_packus_epi16(val, val); _mm_maskmoveu_si128(val, mask, (char *)&dst0[col]); // Second T20 = _mm_maddubs_epi16(TC1, mCoef1); T40 = _mm_maddubs_epi16(TC2, mCoef1); T60 = _mm_maddubs_epi16(TC3, mCoef1); T80 = _mm_maddubs_epi16(TC4, mCoef1); sum = _mm_hadd_epi16(_mm_hadd_epi16(T20, T40), _mm_hadd_epi16(T60, T80)); _mm_store_si128((__m128i*)(&tmp1[col]), sum); val = _mm_srai_epi16(_mm_add_epi16(sum, mAddOffset), shift); val = _mm_packus_epi16(val, val); _mm_maskmoveu_si128(val, mask, (char *)&dst1[col]); 
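        /* A scalar sketch of the masked tail stores above (an illustration,
         * assuming 8-bit pel_t as in this !HIGH_BIT_DEPTH path; `res` stands
         * for the low eight packed results and is not a variable here):
         *
         *     for (int k = 0; k < (width & 7); k++) {
         *         dst1[col + k] = res[k];   // only the leftover columns are written
         *     }
         *
         * _mm_maskmoveu_si128 stores byte k of `val` only where byte k of
         * `mask` has its most significant bit set, so lanes 8..15 and the
         * columns beyond (width & 7) are left untouched. */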
// Third T20 = _mm_maddubs_epi16(TC1, mCoef2); T40 = _mm_maddubs_epi16(TC2, mCoef2); T60 = _mm_maddubs_epi16(TC3, mCoef2); T80 = _mm_maddubs_epi16(TC4, mCoef2); sum = _mm_hadd_epi16(_mm_hadd_epi16(T20, T40), _mm_hadd_epi16(T60, T80)); _mm_store_si128((__m128i*)(&tmp2[col]), sum); val = _mm_srai_epi16(_mm_add_epi16(sum, mAddOffset), shift); val = _mm_packus_epi16(val, val); _mm_maskmoveu_si128(val, mask, (char *)&dst2[col]); } src += i_src; tmp0 += i_tmp; tmp1 += i_tmp; tmp2 += i_tmp; dst0 += i_dst; dst1 += i_dst; dst2 += i_dst; } } /* --------------------------------------------------------------------------- */ #define INTPL_LUMA_VER_SSE128_COMPUT(W0,W1,W2,W3,W4,W5,W6,W7,result) \ T0 = _mm_maddubs_epi16(D0, W0); \ T1 = _mm_maddubs_epi16(D1, W1); \ T2 = _mm_maddubs_epi16(D2, W2); \ T3 = _mm_maddubs_epi16(D3, W3); \ T4 = _mm_maddubs_epi16(D4, W4); \ T5 = _mm_maddubs_epi16(D5, W5); \ T6 = _mm_maddubs_epi16(D6, W6); \ T7 = _mm_maddubs_epi16(D7, W7); \ \ mVal1 = _mm_add_epi16(T0, T1); \ mVal1 = _mm_add_epi16(mVal1, T2); \ mVal1 = _mm_add_epi16(mVal1, T3); \ \ mVal2 = _mm_add_epi16(T4, T5); \ mVal2 = _mm_add_epi16(mVal2, T6); \ mVal2 = _mm_add_epi16(mVal2, T7); \ \ mVal1 = _mm_add_epi16(mVal1, mAddOffset); \ mVal2 = _mm_add_epi16(mVal2, mAddOffset); \ mVal1 = _mm_srai_epi16(mVal1, shift); \ mVal2 = _mm_srai_epi16(mVal2, shift); \ result = _mm_packus_epi16(mVal1, mVal2); #define INTPL_LUMA_VER_SSE128_STORE(result, store_dst) \ _mm_storeu_si128((__m128i*)&(store_dst)[col], result); #define INTPL_LUMA_VER_SSE128_COMPUT_LO(W0,W1,W2,W3,result) \ T0 = _mm_maddubs_epi16(D0, W0); \ T1 = _mm_maddubs_epi16(D1, W1); \ T2 = _mm_maddubs_epi16(D2, W2); \ T3 = _mm_maddubs_epi16(D3, W3); \ \ mVal1 = _mm_add_epi16(T0, T1); \ mVal1 = _mm_add_epi16(mVal1, T2); \ mVal1 = _mm_add_epi16(mVal1, T3); \ \ mVal1 = _mm_add_epi16(mVal1, mAddOffset); \ mVal1 = _mm_srai_epi16(mVal1, shift); \ result = _mm_packus_epi16(mVal1, mVal1); void intpl_luma_ver_sse128(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, int8_t const *coeff) { int row, col; const short offset = 32; const int shift = 6; __m128i mAddOffset = _mm_set1_epi16(offset); pel_t const *p; src -= 3 * i_src; int8_t coeff_tmp[2]; coeff_tmp[0] = coeff[7],coeff_tmp[1] = coeff[0]; __m128i coeff70 = _mm_set1_epi16(*(short*)coeff_tmp); __m128i coeff12 = _mm_set1_epi16(*(short*)(coeff + 1)); __m128i coeff34 = _mm_set1_epi16(*(short*)(coeff + 3)); __m128i coeff56 = _mm_set1_epi16(*(short*)(coeff + 5)); __m128i coeff01 = _mm_set1_epi16(*(short*)coeff); __m128i coeff23 = _mm_set1_epi16(*(short*)(coeff + 2)); __m128i coeff45 = _mm_set1_epi16(*(short*)(coeff + 4)); __m128i coeff67 = _mm_set1_epi16(*(short*)(coeff + 6)); __m128i mVal1, mVal2; __m128i T00, T10, T20, T30, T40, T50, T60, T70, T80, T90, Ta0; __m128i T0, T1, T2, T3, T4, T5, T6, T7; __m128i D0, D1, D2, D3, D4, D5, D6, D7; __m128i U0, U1, U2, U3; for (row = 0; row < height; row = row + 4) { p = src; for (col = 0; col < width - 8; col += 16) { T00 = _mm_loadu_si128((__m128i*)(p)); T10 = _mm_loadu_si128((__m128i*)(p + i_src)); T20 = _mm_loadu_si128((__m128i*)(p + 2 * i_src)); T30 = _mm_loadu_si128((__m128i*)(p + 3 * i_src)); T40 = _mm_loadu_si128((__m128i*)(p + 4 * i_src)); T50 = _mm_loadu_si128((__m128i*)(p + 5 * i_src)); T60 = _mm_loadu_si128((__m128i*)(p + 6 * i_src)); T70 = _mm_loadu_si128((__m128i*)(p + 7 * i_src)); T80 = _mm_loadu_si128((__m128i*)(p + 8 * i_src)); T90 = _mm_loadu_si128((__m128i*)(p + 9 * i_src)); Ta0 = _mm_loadu_si128((__m128i*)(p + 10 * i_src)); //0 D0 = _mm_unpacklo_epi8(T00, 
T10); D1 = _mm_unpacklo_epi8(T20, T30); D2 = _mm_unpacklo_epi8(T40, T50); D3 = _mm_unpacklo_epi8(T60, T70); D4 = _mm_unpackhi_epi8(T00, T10); D5 = _mm_unpackhi_epi8(T20, T30); D6 = _mm_unpackhi_epi8(T40, T50); D7 = _mm_unpackhi_epi8(T60, T70); INTPL_LUMA_VER_SSE128_COMPUT(coeff01, coeff23, coeff45, coeff67, coeff01, coeff23, coeff45, coeff67, U0); INTPL_LUMA_VER_SSE128_STORE(U0, dst); //1 D0 = _mm_unpacklo_epi8(T80, T10); D4 = _mm_unpackhi_epi8(T80, T10); INTPL_LUMA_VER_SSE128_COMPUT(coeff70, coeff12, coeff34, coeff56, coeff70, coeff12, coeff34, coeff56, U1); INTPL_LUMA_VER_SSE128_STORE(U1, dst + i_dst); //2 D0 = _mm_unpacklo_epi8(T80, T90); D4 = _mm_unpackhi_epi8(T80, T90); INTPL_LUMA_VER_SSE128_COMPUT(coeff67, coeff01, coeff23, coeff45, coeff67, coeff01, coeff23, coeff45, U2); INTPL_LUMA_VER_SSE128_STORE(U2, dst + 2 * i_dst); //3 D1 = _mm_unpacklo_epi8(Ta0, T30); D5 = _mm_unpackhi_epi8(Ta0, T30); INTPL_LUMA_VER_SSE128_COMPUT(coeff56, coeff70, coeff12, coeff34, coeff56, coeff70, coeff12, coeff34, U3); INTPL_LUMA_VER_SSE128_STORE(U3, dst + 3 * i_dst); p += 16; } //<=8bit if (col < width) { T00 = _mm_loadu_si128((__m128i*)(p)); T10 = _mm_loadu_si128((__m128i*)(p + i_src)); T20 = _mm_loadu_si128((__m128i*)(p + 2 * i_src)); T30 = _mm_loadu_si128((__m128i*)(p + 3 * i_src)); T40 = _mm_loadu_si128((__m128i*)(p + 4 * i_src)); T50 = _mm_loadu_si128((__m128i*)(p + 5 * i_src)); T60 = _mm_loadu_si128((__m128i*)(p + 6 * i_src)); T70 = _mm_loadu_si128((__m128i*)(p + 7 * i_src)); T80 = _mm_loadu_si128((__m128i*)(p + 8 * i_src)); T90 = _mm_loadu_si128((__m128i*)(p + 9 * i_src)); Ta0 = _mm_loadu_si128((__m128i*)(p + 10 * i_src)); //0 D0 = _mm_unpacklo_epi8(T00, T10); D1 = _mm_unpacklo_epi8(T20, T30); D2 = _mm_unpacklo_epi8(T40, T50); D3 = _mm_unpacklo_epi8(T60, T70); INTPL_LUMA_VER_SSE128_COMPUT_LO(coeff01, coeff23, coeff45, coeff67, U0); INTPL_LUMA_VER_SSE128_STORE(U0, dst); //1 D0 = _mm_unpacklo_epi8(T80, T10); INTPL_LUMA_VER_SSE128_COMPUT_LO(coeff70, coeff12, coeff34, coeff56, U1); INTPL_LUMA_VER_SSE128_STORE(U1, dst + i_dst); //2 D0 = _mm_unpacklo_epi8(T80, T90); INTPL_LUMA_VER_SSE128_COMPUT_LO(coeff67, coeff01, coeff23, coeff45, U2); INTPL_LUMA_VER_SSE128_STORE(U2, dst + 2 * i_dst); //3 D1 = _mm_unpacklo_epi8(Ta0, T30); INTPL_LUMA_VER_SSE128_COMPUT_LO(coeff56, coeff70, coeff12, coeff34, U3); INTPL_LUMA_VER_SSE128_STORE(U3, dst + 3 * i_dst); p += 8; col += 8; } src += i_src * 4; dst += i_dst * 4; } } /* --------------------------------------------------------------------------- * */ void intpl_luma_ver_x3_sse128(pel_t *const dst[3], int i_dst, pel_t *src, int i_src, int width, int height, int8_t const **coeff) { /* intpl_luma_ver_sse128(dst0, i_dst, src, i_src, width, height, coeff[0]); intpl_luma_ver_sse128(dst1, i_dst, src, i_src, width, height, coeff[1]); intpl_luma_ver_sse128(dst2, i_dst, src, i_src, width, height, coeff[2]); */ int row, col; const short offset = 32; const int shift = 6; int bsymFirst = (coeff[0][1] == coeff[0][6]); int bsymSecond = (coeff[1][1] == coeff[1][6]); int bsymThird = (coeff[2][1] == coeff[2][6]); __m128i mAddOffset = _mm_set1_epi16(offset); pel_t const *p; __m128i mask = _mm_loadu_si128((__m128i*)(intrinsic_mask[(width & 7) - 1])); src -= 3 * i_src; __m128i coeffFirst0, coeffFirst1, coeffFirst2, coeffFirst3; __m128i coeffSecond0, coeffSecond1, coeffSecond2, coeffSecond3; __m128i coeffThird0, coeffThird1, coeffThird2, coeffThird3; __m128i tempT00, tempT10, tempT20, tempT30; __m128i mVal; pel_t *dst0 = dst[0]; pel_t *dst1 = dst[1]; pel_t *dst2 = dst[2]; //load Coefficient 
if (bsymFirst) { coeffFirst0 = _mm_set1_epi8(coeff[0][0]); coeffFirst1 = _mm_set1_epi8(coeff[0][1]); coeffFirst2 = _mm_set1_epi8(coeff[0][2]); coeffFirst3 = _mm_set1_epi8(coeff[0][3]); } else { coeffFirst0 = _mm_set1_epi16(*(short*)coeff[0]); coeffFirst1 = _mm_set1_epi16(*(short*)(coeff[0] + 2)); coeffFirst2 = _mm_set1_epi16(*(short*)(coeff[0] + 4)); coeffFirst3 = _mm_set1_epi16(*(short*)(coeff[0] + 6)); } if (bsymSecond) { coeffSecond0 = _mm_set1_epi8(coeff[1][0]); coeffSecond1 = _mm_set1_epi8(coeff[1][1]); coeffSecond2 = _mm_set1_epi8(coeff[1][2]); coeffSecond3 = _mm_set1_epi8(coeff[1][3]); } else { coeffSecond0 = _mm_set1_epi16(*(short*)coeff[1]); coeffSecond1 = _mm_set1_epi16(*(short*)(coeff[1] + 2)); coeffSecond2 = _mm_set1_epi16(*(short*)(coeff[1] + 4)); coeffSecond3 = _mm_set1_epi16(*(short*)(coeff[1] + 6)); } if (bsymThird) { coeffThird0 = _mm_set1_epi8(coeff[2][0]); coeffThird1 = _mm_set1_epi8(coeff[2][1]); coeffThird2 = _mm_set1_epi8(coeff[2][2]); coeffThird3 = _mm_set1_epi8(coeff[2][3]); } else { coeffThird0 = _mm_set1_epi16(*(short*)coeff[2]); coeffThird1 = _mm_set1_epi16(*(short*)(coeff[2] + 2)); coeffThird2 = _mm_set1_epi16(*(short*)(coeff[2] + 4)); coeffThird3 = _mm_set1_epi16(*(short*)(coeff[2] + 6)); } //Double For for (row = 0; row < height - 3; row += 4) { p = src; for (col = 0; col < width - 7; col += 8) { __m128i T00 = _mm_loadu_si128((__m128i*)(p)); __m128i T10 = _mm_loadu_si128((__m128i*)(p + i_src)); __m128i T20 = _mm_loadu_si128((__m128i*)(p + 2 * i_src)); __m128i T30 = _mm_loadu_si128((__m128i*)(p + 3 * i_src)); __m128i T40 = _mm_loadu_si128((__m128i*)(p + 4 * i_src)); __m128i T50 = _mm_loadu_si128((__m128i*)(p + 5 * i_src)); __m128i T60 = _mm_loadu_si128((__m128i*)(p + 6 * i_src)); __m128i T70 = _mm_loadu_si128((__m128i*)(p + 7 * i_src)); __m128i T80 = _mm_loadu_si128((__m128i*)(p + 8 * i_src)); __m128i T90 = _mm_loadu_si128((__m128i*)(p + 9 * i_src)); __m128i Ta0 = _mm_loadu_si128((__m128i*)(p + 10 * i_src)); //First if (bsymFirst) { tempT00 = _mm_unpacklo_epi8(T00, T70); tempT10 = _mm_unpacklo_epi8(T10, T60); tempT20 = _mm_unpacklo_epi8(T20, T50); tempT30 = _mm_unpacklo_epi8(T30, T40); } else { tempT00 = _mm_unpacklo_epi8(T00, T10); tempT10 = _mm_unpacklo_epi8(T20, T30); tempT20 = _mm_unpacklo_epi8(T40, T50); tempT30 = _mm_unpacklo_epi8(T60, T70); } tempT00 = _mm_maddubs_epi16(tempT00, coeffFirst0); tempT10 = _mm_maddubs_epi16(tempT10, coeffFirst1); tempT20 = _mm_maddubs_epi16(tempT20, coeffFirst2); tempT30 = _mm_maddubs_epi16(tempT30, coeffFirst3); mVal = _mm_add_epi16(tempT00, tempT10); mVal = _mm_add_epi16(mVal, tempT20); mVal = _mm_add_epi16(mVal, tempT30); mVal = _mm_add_epi16(mVal, mAddOffset); mVal = _mm_srai_epi16(mVal, shift); mVal = _mm_packus_epi16(mVal, mVal); _mm_storel_epi64((__m128i*)&dst0[col], mVal); if (bsymFirst) { tempT00 = _mm_unpacklo_epi8(T10, T80); tempT10 = _mm_unpacklo_epi8(T20, T70); tempT20 = _mm_unpacklo_epi8(T30, T60); tempT30 = _mm_unpacklo_epi8(T40, T50); } else { tempT00 = _mm_unpacklo_epi8(T10, T20); tempT10 = _mm_unpacklo_epi8(T30, T40); tempT20 = _mm_unpacklo_epi8(T50, T60); tempT30 = _mm_unpacklo_epi8(T70, T80); } tempT00 = _mm_maddubs_epi16(tempT00, coeffFirst0); tempT10 = _mm_maddubs_epi16(tempT10, coeffFirst1); tempT20 = _mm_maddubs_epi16(tempT20, coeffFirst2); tempT30 = _mm_maddubs_epi16(tempT30, coeffFirst3); mVal = _mm_add_epi16(tempT00, tempT10); mVal = _mm_add_epi16(mVal, tempT20); mVal = _mm_add_epi16(mVal, tempT30); mVal = _mm_add_epi16(mVal, mAddOffset); mVal = _mm_srai_epi16(mVal, shift); mVal = 
_mm_packus_epi16(mVal, mVal);
            _mm_storel_epi64((__m128i*)(&dst0[col] + i_dst), mVal);

            if (bsymFirst) {
                tempT00 = _mm_unpacklo_epi8(T20, T90);
                tempT10 = _mm_unpacklo_epi8(T30, T80);
                tempT20 = _mm_unpacklo_epi8(T40, T70);
                tempT30 = _mm_unpacklo_epi8(T50, T60);
            } else {
                tempT00 = _mm_unpacklo_epi8(T20, T30);
                tempT10 = _mm_unpacklo_epi8(T40, T50);
                tempT20 = _mm_unpacklo_epi8(T60, T70);
                tempT30 = _mm_unpacklo_epi8(T80, T90);
            }
            tempT00 = _mm_maddubs_epi16(tempT00, coeffFirst0);
            tempT10 = _mm_maddubs_epi16(tempT10, coeffFirst1);
            tempT20 = _mm_maddubs_epi16(tempT20, coeffFirst2);
            tempT30 = _mm_maddubs_epi16(tempT30, coeffFirst3);
            mVal = _mm_add_epi16(tempT00, tempT10);
            mVal = _mm_add_epi16(mVal, tempT20);
            mVal = _mm_add_epi16(mVal, tempT30);
            mVal = _mm_add_epi16(mVal, mAddOffset);
            mVal = _mm_srai_epi16(mVal, shift);
            mVal = _mm_packus_epi16(mVal, mVal);
            _mm_storel_epi64((__m128i*)(&dst0[col] + 2 * i_dst), mVal);

            if (bsymFirst) {
                tempT00 = _mm_unpacklo_epi8(T30, Ta0);
                tempT10 = _mm_unpacklo_epi8(T40, T90);
                tempT20 = _mm_unpacklo_epi8(T50, T80);
                tempT30 = _mm_unpacklo_epi8(T60, T70);
            } else {
                tempT00 = _mm_unpacklo_epi8(T30, T40);
                tempT10 = _mm_unpacklo_epi8(T50, T60);
                tempT20 = _mm_unpacklo_epi8(T70, T80);
                tempT30 = _mm_unpacklo_epi8(T90, Ta0);
            }
            tempT00 = _mm_maddubs_epi16(tempT00, coeffFirst0);
            tempT10 = _mm_maddubs_epi16(tempT10, coeffFirst1);
            tempT20 = _mm_maddubs_epi16(tempT20, coeffFirst2);
            tempT30 = _mm_maddubs_epi16(tempT30, coeffFirst3);
            mVal = _mm_add_epi16(tempT00, tempT10);
            mVal = _mm_add_epi16(mVal, tempT20);
            mVal = _mm_add_epi16(mVal, tempT30);
            mVal = _mm_add_epi16(mVal, mAddOffset);
            mVal = _mm_srai_epi16(mVal, shift);
            mVal = _mm_packus_epi16(mVal, mVal);
            _mm_storel_epi64((__m128i*)(&dst0[col] + 3 * i_dst), mVal);

            // Second
            if (bsymSecond) {
                tempT00 = _mm_unpacklo_epi8(T00, T70);
                tempT10 = _mm_unpacklo_epi8(T10, T60);
                tempT20 = _mm_unpacklo_epi8(T20, T50);
                tempT30 = _mm_unpacklo_epi8(T30, T40);
            } else {
                tempT00 = _mm_unpacklo_epi8(T00, T10);
                tempT10 = _mm_unpacklo_epi8(T20, T30);
                tempT20 = _mm_unpacklo_epi8(T40, T50);
                tempT30 = _mm_unpacklo_epi8(T60, T70);
            }
            tempT00 = _mm_maddubs_epi16(tempT00, coeffSecond0);
            tempT10 = _mm_maddubs_epi16(tempT10, coeffSecond1);
            tempT20 = _mm_maddubs_epi16(tempT20, coeffSecond2);
            tempT30 = _mm_maddubs_epi16(tempT30, coeffSecond3);
            mVal = _mm_add_epi16(tempT00, tempT10);
            mVal = _mm_add_epi16(mVal, tempT20);
            mVal = _mm_add_epi16(mVal, tempT30);
            mVal = _mm_add_epi16(mVal, mAddOffset);
            mVal = _mm_srai_epi16(mVal, shift);
            mVal = _mm_packus_epi16(mVal, mVal);
            _mm_storel_epi64((__m128i*)&dst1[col], mVal);

            if (bsymSecond) {
                tempT00 = _mm_unpacklo_epi8(T10, T80);
                tempT10 = _mm_unpacklo_epi8(T20, T70);
                tempT20 = _mm_unpacklo_epi8(T30, T60);
                tempT30 = _mm_unpacklo_epi8(T40, T50);
            } else {
                tempT00 = _mm_unpacklo_epi8(T10, T20);
                tempT10 = _mm_unpacklo_epi8(T30, T40);
                tempT20 = _mm_unpacklo_epi8(T50, T60);
                tempT30 = _mm_unpacklo_epi8(T70, T80);
            }
            tempT00 = _mm_maddubs_epi16(tempT00, coeffSecond0);
            tempT10 = _mm_maddubs_epi16(tempT10, coeffSecond1);
            tempT20 = _mm_maddubs_epi16(tempT20, coeffSecond2);
            tempT30 = _mm_maddubs_epi16(tempT30, coeffSecond3);
            mVal = _mm_add_epi16(tempT00, tempT10);
            mVal = _mm_add_epi16(mVal, tempT20);
            mVal = _mm_add_epi16(mVal, tempT30);
            mVal = _mm_add_epi16(mVal, mAddOffset);
            mVal = _mm_srai_epi16(mVal, shift);
            mVal = _mm_packus_epi16(mVal, mVal);
            _mm_storel_epi64((__m128i*)(&dst1[col] + i_dst), mVal);

            if (bsymSecond) {
                tempT00 = _mm_unpacklo_epi8(T20, T90);
                tempT10 = _mm_unpacklo_epi8(T30, T80);
                tempT20 = _mm_unpacklo_epi8(T40, T70);
                tempT30 =
_mm_unpacklo_epi8(T50, T60); } else { tempT00 = _mm_unpacklo_epi8(T20, T30); tempT10 = _mm_unpacklo_epi8(T40, T50); tempT20 = _mm_unpacklo_epi8(T60, T70); tempT30 = _mm_unpacklo_epi8(T80, T90); } tempT00 = _mm_maddubs_epi16(tempT00, coeffSecond0); tempT10 = _mm_maddubs_epi16(tempT10, coeffSecond1); tempT20 = _mm_maddubs_epi16(tempT20, coeffSecond2); tempT30 = _mm_maddubs_epi16(tempT30, coeffSecond3); mVal = _mm_add_epi16(tempT00, tempT10); mVal = _mm_add_epi16(mVal, tempT20); mVal = _mm_add_epi16(mVal, tempT30); mVal = _mm_add_epi16(mVal, mAddOffset); mVal = _mm_srai_epi16(mVal, shift); mVal = _mm_packus_epi16(mVal, mVal); _mm_storel_epi64((__m128i*)(&dst1[col] + 2 * i_dst), mVal); if (bsymSecond) { tempT00 = _mm_unpacklo_epi8(T30, Ta0); tempT10 = _mm_unpacklo_epi8(T40, T90); tempT20 = _mm_unpacklo_epi8(T50, T80); tempT30 = _mm_unpacklo_epi8(T60, T70); } else { tempT00 = _mm_unpacklo_epi8(T30, T40); tempT10 = _mm_unpacklo_epi8(T50, T60); tempT20 = _mm_unpacklo_epi8(T70, T80); tempT30 = _mm_unpacklo_epi8(T90, Ta0); } tempT00 = _mm_maddubs_epi16(tempT00, coeffSecond0); tempT10 = _mm_maddubs_epi16(tempT10, coeffSecond1); tempT20 = _mm_maddubs_epi16(tempT20, coeffSecond2); tempT30 = _mm_maddubs_epi16(tempT30, coeffSecond3); mVal = _mm_add_epi16(tempT00, tempT10); mVal = _mm_add_epi16(mVal, tempT20); mVal = _mm_add_epi16(mVal, tempT30); mVal = _mm_add_epi16(mVal, mAddOffset); mVal = _mm_srai_epi16(mVal, shift); mVal = _mm_packus_epi16(mVal, mVal); _mm_storel_epi64((__m128i*)(&dst1[col] + 3 * i_dst), mVal); //Third if (bsymThird) { tempT00 = _mm_unpacklo_epi8(T00, T70); tempT10 = _mm_unpacklo_epi8(T10, T60); tempT20 = _mm_unpacklo_epi8(T20, T50); tempT30 = _mm_unpacklo_epi8(T30, T40); } else { tempT00 = _mm_unpacklo_epi8(T00, T10); tempT10 = _mm_unpacklo_epi8(T20, T30); tempT20 = _mm_unpacklo_epi8(T40, T50); tempT30 = _mm_unpacklo_epi8(T60, T70); } tempT00 = _mm_maddubs_epi16(tempT00, coeffThird0); tempT10 = _mm_maddubs_epi16(tempT10, coeffThird1); tempT20 = _mm_maddubs_epi16(tempT20, coeffThird2); tempT30 = _mm_maddubs_epi16(tempT30, coeffThird3); mVal = _mm_add_epi16(tempT00, tempT10); mVal = _mm_add_epi16(mVal, tempT20); mVal = _mm_add_epi16(mVal, tempT30); mVal = _mm_add_epi16(mVal, mAddOffset); mVal = _mm_srai_epi16(mVal, shift); mVal = _mm_packus_epi16(mVal, mVal); _mm_storel_epi64((__m128i*)&dst2[col], mVal); if (bsymThird) { tempT00 = _mm_unpacklo_epi8(T10, T80); tempT10 = _mm_unpacklo_epi8(T20, T70); tempT20 = _mm_unpacklo_epi8(T30, T60); tempT30 = _mm_unpacklo_epi8(T40, T50); } else { tempT00 = _mm_unpacklo_epi8(T10, T20); tempT10 = _mm_unpacklo_epi8(T30, T40); tempT20 = _mm_unpacklo_epi8(T50, T60); tempT30 = _mm_unpacklo_epi8(T70, T80); } tempT00 = _mm_maddubs_epi16(tempT00, coeffThird0); tempT10 = _mm_maddubs_epi16(tempT10, coeffThird1); tempT20 = _mm_maddubs_epi16(tempT20, coeffThird2); tempT30 = _mm_maddubs_epi16(tempT30, coeffThird3); mVal = _mm_add_epi16(tempT00, tempT10); mVal = _mm_add_epi16(mVal, tempT20); mVal = _mm_add_epi16(mVal, tempT30); mVal = _mm_add_epi16(mVal, mAddOffset); mVal = _mm_srai_epi16(mVal, shift); mVal = _mm_packus_epi16(mVal, mVal); _mm_storel_epi64((__m128i*)(&dst2[col] + i_dst), mVal); if (bsymThird) { tempT00 = _mm_unpacklo_epi8(T20, T90); tempT10 = _mm_unpacklo_epi8(T30, T80); tempT20 = _mm_unpacklo_epi8(T40, T70); tempT30 = _mm_unpacklo_epi8(T50, T60); } else { tempT00 = _mm_unpacklo_epi8(T20, T30); tempT10 = _mm_unpacklo_epi8(T40, T50); tempT20 = _mm_unpacklo_epi8(T60, T70); tempT30 = _mm_unpacklo_epi8(T80, T90); } tempT00 = 
_mm_maddubs_epi16(tempT00, coeffThird0); tempT10 = _mm_maddubs_epi16(tempT10, coeffThird1); tempT20 = _mm_maddubs_epi16(tempT20, coeffThird2); tempT30 = _mm_maddubs_epi16(tempT30, coeffThird3); mVal = _mm_add_epi16(tempT00, tempT10); mVal = _mm_add_epi16(mVal, tempT20); mVal = _mm_add_epi16(mVal, tempT30); mVal = _mm_add_epi16(mVal, mAddOffset); mVal = _mm_srai_epi16(mVal, shift); mVal = _mm_packus_epi16(mVal, mVal); _mm_storel_epi64((__m128i*)(&dst2[col] + 2 * i_dst), mVal); if (bsymThird) { tempT00 = _mm_unpacklo_epi8(T30, Ta0); tempT10 = _mm_unpacklo_epi8(T40, T90); tempT20 = _mm_unpacklo_epi8(T50, T80); tempT30 = _mm_unpacklo_epi8(T60, T70); } else { tempT00 = _mm_unpacklo_epi8(T30, T40); tempT10 = _mm_unpacklo_epi8(T50, T60); tempT20 = _mm_unpacklo_epi8(T70, T80); tempT30 = _mm_unpacklo_epi8(T90, Ta0); } tempT00 = _mm_maddubs_epi16(tempT00, coeffThird0); tempT10 = _mm_maddubs_epi16(tempT10, coeffThird1); tempT20 = _mm_maddubs_epi16(tempT20, coeffThird2); tempT30 = _mm_maddubs_epi16(tempT30, coeffThird3); mVal = _mm_add_epi16(tempT00, tempT10); mVal = _mm_add_epi16(mVal, tempT20); mVal = _mm_add_epi16(mVal, tempT30); mVal = _mm_add_epi16(mVal, mAddOffset); mVal = _mm_srai_epi16(mVal, shift); mVal = _mm_packus_epi16(mVal, mVal); _mm_storel_epi64((__m128i*)(&dst2[col] + 3 * i_dst), mVal); p += 8; } if (col < width) { // store either 1, 2, 3, 4, 5, 6, or 7 8-bit results in dst __m128i T00 = _mm_loadu_si128((__m128i*)(p)); __m128i T10 = _mm_loadu_si128((__m128i*)(p + i_src)); __m128i T20 = _mm_loadu_si128((__m128i*)(p + 2 * i_src)); __m128i T30 = _mm_loadu_si128((__m128i*)(p + 3 * i_src)); __m128i T40 = _mm_loadu_si128((__m128i*)(p + 4 * i_src)); __m128i T50 = _mm_loadu_si128((__m128i*)(p + 5 * i_src)); __m128i T60 = _mm_loadu_si128((__m128i*)(p + 6 * i_src)); __m128i T70 = _mm_loadu_si128((__m128i*)(p + 7 * i_src)); __m128i T80 = _mm_loadu_si128((__m128i*)(p + 8 * i_src)); __m128i T90 = _mm_loadu_si128((__m128i*)(p + 9 * i_src)); __m128i Ta0 = _mm_loadu_si128((__m128i*)(p + 10 * i_src)); //First if (bsymFirst) { tempT00 = _mm_unpacklo_epi8(T00, T70); tempT10 = _mm_unpacklo_epi8(T10, T60); tempT20 = _mm_unpacklo_epi8(T20, T50); tempT30 = _mm_unpacklo_epi8(T30, T40); } else { tempT00 = _mm_unpacklo_epi8(T00, T10); tempT10 = _mm_unpacklo_epi8(T20, T30); tempT20 = _mm_unpacklo_epi8(T40, T50); tempT30 = _mm_unpacklo_epi8(T60, T70); } tempT00 = _mm_maddubs_epi16(tempT00, coeffFirst0); tempT10 = _mm_maddubs_epi16(tempT10, coeffFirst1); tempT20 = _mm_maddubs_epi16(tempT20, coeffFirst2); tempT30 = _mm_maddubs_epi16(tempT30, coeffFirst3); mVal = _mm_add_epi16(tempT00, tempT10); mVal = _mm_add_epi16(mVal, tempT20); mVal = _mm_add_epi16(mVal, tempT30); mVal = _mm_add_epi16(mVal, mAddOffset); mVal = _mm_srai_epi16(mVal, shift); mVal = _mm_packus_epi16(mVal, mVal); _mm_maskmoveu_si128(mVal, mask, (char *)&dst0[col]); if (bsymFirst) { tempT00 = _mm_unpacklo_epi8(T10, T80); tempT10 = _mm_unpacklo_epi8(T20, T70); tempT20 = _mm_unpacklo_epi8(T30, T60); tempT30 = _mm_unpacklo_epi8(T40, T50); } else { tempT00 = _mm_unpacklo_epi8(T10, T20); tempT10 = _mm_unpacklo_epi8(T30, T40); tempT20 = _mm_unpacklo_epi8(T50, T60); tempT30 = _mm_unpacklo_epi8(T70, T80); } tempT00 = _mm_maddubs_epi16(tempT00, coeffFirst0); tempT10 = _mm_maddubs_epi16(tempT10, coeffFirst1); tempT20 = _mm_maddubs_epi16(tempT20, coeffFirst2); tempT30 = _mm_maddubs_epi16(tempT30, coeffFirst3); mVal = _mm_add_epi16(tempT00, tempT10); mVal = _mm_add_epi16(mVal, tempT20); mVal = _mm_add_epi16(mVal, tempT30); mVal = _mm_add_epi16(mVal, mAddOffset); 
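            /* rounding step: each 16-bit lane now holds (8-tap sum + 32); the
             * arithmetic shift below completes (sum + 32) >> 6 before the
             * result is packed back to 8-bit pixels */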
mVal = _mm_srai_epi16(mVal, shift); mVal = _mm_packus_epi16(mVal, mVal); _mm_maskmoveu_si128(mVal, mask, (char *)(&dst0[col] + i_dst)); if (bsymFirst) { tempT00 = _mm_unpacklo_epi8(T20, T90); tempT10 = _mm_unpacklo_epi8(T30, T80); tempT20 = _mm_unpacklo_epi8(T40, T70); tempT30 = _mm_unpacklo_epi8(T50, T60); } else { tempT00 = _mm_unpacklo_epi8(T20, T30); tempT10 = _mm_unpacklo_epi8(T40, T50); tempT20 = _mm_unpacklo_epi8(T60, T70); tempT30 = _mm_unpacklo_epi8(T80, T90); } tempT00 = _mm_maddubs_epi16(tempT00, coeffFirst0); tempT10 = _mm_maddubs_epi16(tempT10, coeffFirst1); tempT20 = _mm_maddubs_epi16(tempT20, coeffFirst2); tempT30 = _mm_maddubs_epi16(tempT30, coeffFirst3); mVal = _mm_add_epi16(tempT00, tempT10); mVal = _mm_add_epi16(mVal, tempT20); mVal = _mm_add_epi16(mVal, tempT30); mVal = _mm_add_epi16(mVal, mAddOffset); mVal = _mm_srai_epi16(mVal, shift); mVal = _mm_packus_epi16(mVal, mVal); _mm_maskmoveu_si128(mVal, mask, (char *)(&dst0[col] + 2 * i_dst)); if (bsymFirst) { tempT00 = _mm_unpacklo_epi8(T30, Ta0); tempT10 = _mm_unpacklo_epi8(T40, T90); tempT20 = _mm_unpacklo_epi8(T50, T80); tempT30 = _mm_unpacklo_epi8(T60, T70); } else { tempT00 = _mm_unpacklo_epi8(T30, T40); tempT10 = _mm_unpacklo_epi8(T50, T60); tempT20 = _mm_unpacklo_epi8(T70, T80); tempT30 = _mm_unpacklo_epi8(T90, Ta0); } tempT00 = _mm_maddubs_epi16(tempT00, coeffFirst0); tempT10 = _mm_maddubs_epi16(tempT10, coeffFirst1); tempT20 = _mm_maddubs_epi16(tempT20, coeffFirst2); tempT30 = _mm_maddubs_epi16(tempT30, coeffFirst3); mVal = _mm_add_epi16(tempT00, tempT10); mVal = _mm_add_epi16(mVal, tempT20); mVal = _mm_add_epi16(mVal, tempT30); mVal = _mm_add_epi16(mVal, mAddOffset); mVal = _mm_srai_epi16(mVal, shift); mVal = _mm_packus_epi16(mVal, mVal); _mm_maskmoveu_si128(mVal, mask, (char *)(&dst0[col] + 3 * i_dst)); //Second if (bsymSecond) { tempT00 = _mm_unpacklo_epi8(T00, T70); tempT10 = _mm_unpacklo_epi8(T10, T60); tempT20 = _mm_unpacklo_epi8(T20, T50); tempT30 = _mm_unpacklo_epi8(T30, T40); } else { tempT00 = _mm_unpacklo_epi8(T00, T10); tempT10 = _mm_unpacklo_epi8(T20, T30); tempT20 = _mm_unpacklo_epi8(T40, T50); tempT30 = _mm_unpacklo_epi8(T60, T70); } tempT00 = _mm_maddubs_epi16(tempT00, coeffSecond0); tempT10 = _mm_maddubs_epi16(tempT10, coeffSecond1); tempT20 = _mm_maddubs_epi16(tempT20, coeffSecond2); tempT30 = _mm_maddubs_epi16(tempT30, coeffSecond3); mVal = _mm_add_epi16(tempT00, tempT10); mVal = _mm_add_epi16(mVal, tempT20); mVal = _mm_add_epi16(mVal, tempT30); mVal = _mm_add_epi16(mVal, mAddOffset); mVal = _mm_srai_epi16(mVal, shift); mVal = _mm_packus_epi16(mVal, mVal); _mm_maskmoveu_si128(mVal, mask, (char *)&dst1[col]); if (bsymSecond) { tempT00 = _mm_unpacklo_epi8(T10, T80); tempT10 = _mm_unpacklo_epi8(T20, T70); tempT20 = _mm_unpacklo_epi8(T30, T60); tempT30 = _mm_unpacklo_epi8(T40, T50); } else { tempT00 = _mm_unpacklo_epi8(T10, T20); tempT10 = _mm_unpacklo_epi8(T30, T40); tempT20 = _mm_unpacklo_epi8(T50, T60); tempT30 = _mm_unpacklo_epi8(T70, T80); } tempT00 = _mm_maddubs_epi16(tempT00, coeffSecond0); tempT10 = _mm_maddubs_epi16(tempT10, coeffSecond1); tempT20 = _mm_maddubs_epi16(tempT20, coeffSecond2); tempT30 = _mm_maddubs_epi16(tempT30, coeffSecond3); mVal = _mm_add_epi16(tempT00, tempT10); mVal = _mm_add_epi16(mVal, tempT20); mVal = _mm_add_epi16(mVal, tempT30); mVal = _mm_add_epi16(mVal, mAddOffset); mVal = _mm_srai_epi16(mVal, shift); mVal = _mm_packus_epi16(mVal, mVal); _mm_maskmoveu_si128(mVal, mask, (char *)(&dst1[col] + i_dst)); if (bsymSecond) { tempT00 = _mm_unpacklo_epi8(T20, T90); tempT10 
= _mm_unpacklo_epi8(T30, T80); tempT20 = _mm_unpacklo_epi8(T40, T70); tempT30 = _mm_unpacklo_epi8(T50, T60); } else { tempT00 = _mm_unpacklo_epi8(T20, T30); tempT10 = _mm_unpacklo_epi8(T40, T50); tempT20 = _mm_unpacklo_epi8(T60, T70); tempT30 = _mm_unpacklo_epi8(T80, T90); } tempT00 = _mm_maddubs_epi16(tempT00, coeffSecond0); tempT10 = _mm_maddubs_epi16(tempT10, coeffSecond1); tempT20 = _mm_maddubs_epi16(tempT20, coeffSecond2); tempT30 = _mm_maddubs_epi16(tempT30, coeffSecond3); mVal = _mm_add_epi16(tempT00, tempT10); mVal = _mm_add_epi16(mVal, tempT20); mVal = _mm_add_epi16(mVal, tempT30); mVal = _mm_add_epi16(mVal, mAddOffset); mVal = _mm_srai_epi16(mVal, shift); mVal = _mm_packus_epi16(mVal, mVal); _mm_maskmoveu_si128(mVal, mask, (char *)(&dst1[col] + 2 * i_dst)); if (bsymSecond) { tempT00 = _mm_unpacklo_epi8(T30, Ta0); tempT10 = _mm_unpacklo_epi8(T40, T90); tempT20 = _mm_unpacklo_epi8(T50, T80); tempT30 = _mm_unpacklo_epi8(T60, T70); } else { tempT00 = _mm_unpacklo_epi8(T30, T40); tempT10 = _mm_unpacklo_epi8(T50, T60); tempT20 = _mm_unpacklo_epi8(T70, T80); tempT30 = _mm_unpacklo_epi8(T90, Ta0); } tempT00 = _mm_maddubs_epi16(tempT00, coeffSecond0); tempT10 = _mm_maddubs_epi16(tempT10, coeffSecond1); tempT20 = _mm_maddubs_epi16(tempT20, coeffSecond2); tempT30 = _mm_maddubs_epi16(tempT30, coeffSecond3); mVal = _mm_add_epi16(tempT00, tempT10); mVal = _mm_add_epi16(mVal, tempT20); mVal = _mm_add_epi16(mVal, tempT30); mVal = _mm_add_epi16(mVal, mAddOffset); mVal = _mm_srai_epi16(mVal, shift); mVal = _mm_packus_epi16(mVal, mVal); _mm_maskmoveu_si128(mVal, mask, (char *)(&dst1[col] + 3 * i_dst)); //Third if (bsymThird) { tempT00 = _mm_unpacklo_epi8(T00, T70); tempT10 = _mm_unpacklo_epi8(T10, T60); tempT20 = _mm_unpacklo_epi8(T20, T50); tempT30 = _mm_unpacklo_epi8(T30, T40); } else { tempT00 = _mm_unpacklo_epi8(T00, T10); tempT10 = _mm_unpacklo_epi8(T20, T30); tempT20 = _mm_unpacklo_epi8(T40, T50); tempT30 = _mm_unpacklo_epi8(T60, T70); } tempT00 = _mm_maddubs_epi16(tempT00, coeffThird0); tempT10 = _mm_maddubs_epi16(tempT10, coeffThird1); tempT20 = _mm_maddubs_epi16(tempT20, coeffThird2); tempT30 = _mm_maddubs_epi16(tempT30, coeffThird3); mVal = _mm_add_epi16(tempT00, tempT10); mVal = _mm_add_epi16(mVal, tempT20); mVal = _mm_add_epi16(mVal, tempT30); mVal = _mm_add_epi16(mVal, mAddOffset); mVal = _mm_srai_epi16(mVal, shift); mVal = _mm_packus_epi16(mVal, mVal); _mm_maskmoveu_si128(mVal, mask, (char *)&dst2[col]); if (bsymThird) { tempT00 = _mm_unpacklo_epi8(T10, T80); tempT10 = _mm_unpacklo_epi8(T20, T70); tempT20 = _mm_unpacklo_epi8(T30, T60); tempT30 = _mm_unpacklo_epi8(T40, T50); } else { tempT00 = _mm_unpacklo_epi8(T10, T20); tempT10 = _mm_unpacklo_epi8(T30, T40); tempT20 = _mm_unpacklo_epi8(T50, T60); tempT30 = _mm_unpacklo_epi8(T70, T80); } tempT00 = _mm_maddubs_epi16(tempT00, coeffThird0); tempT10 = _mm_maddubs_epi16(tempT10, coeffThird1); tempT20 = _mm_maddubs_epi16(tempT20, coeffThird2); tempT30 = _mm_maddubs_epi16(tempT30, coeffThird3); mVal = _mm_add_epi16(tempT00, tempT10); mVal = _mm_add_epi16(mVal, tempT20); mVal = _mm_add_epi16(mVal, tempT30); mVal = _mm_add_epi16(mVal, mAddOffset); mVal = _mm_srai_epi16(mVal, shift); mVal = _mm_packus_epi16(mVal, mVal); _mm_maskmoveu_si128(mVal, mask, (char *)(&dst2[col] + i_dst)); if (bsymThird) { tempT00 = _mm_unpacklo_epi8(T20, T90); tempT10 = _mm_unpacklo_epi8(T30, T80); tempT20 = _mm_unpacklo_epi8(T40, T70); tempT30 = _mm_unpacklo_epi8(T50, T60); } else { tempT00 = _mm_unpacklo_epi8(T20, T30); tempT10 = _mm_unpacklo_epi8(T40, T50); tempT20 
= _mm_unpacklo_epi8(T60, T70); tempT30 = _mm_unpacklo_epi8(T80, T90); } tempT00 = _mm_maddubs_epi16(tempT00, coeffThird0); tempT10 = _mm_maddubs_epi16(tempT10, coeffThird1); tempT20 = _mm_maddubs_epi16(tempT20, coeffThird2); tempT30 = _mm_maddubs_epi16(tempT30, coeffThird3); mVal = _mm_add_epi16(tempT00, tempT10); mVal = _mm_add_epi16(mVal, tempT20); mVal = _mm_add_epi16(mVal, tempT30); mVal = _mm_add_epi16(mVal, mAddOffset); mVal = _mm_srai_epi16(mVal, shift); mVal = _mm_packus_epi16(mVal, mVal); _mm_maskmoveu_si128(mVal, mask, (char *)(&dst2[col] + 2 * i_dst)); if (bsymThird) { tempT00 = _mm_unpacklo_epi8(T30, Ta0); tempT10 = _mm_unpacklo_epi8(T40, T90); tempT20 = _mm_unpacklo_epi8(T50, T80); tempT30 = _mm_unpacklo_epi8(T60, T70); } else { tempT00 = _mm_unpacklo_epi8(T30, T40); tempT10 = _mm_unpacklo_epi8(T50, T60); tempT20 = _mm_unpacklo_epi8(T70, T80); tempT30 = _mm_unpacklo_epi8(T90, Ta0); } tempT00 = _mm_maddubs_epi16(tempT00, coeffThird0); tempT10 = _mm_maddubs_epi16(tempT10, coeffThird1); tempT20 = _mm_maddubs_epi16(tempT20, coeffThird2); tempT30 = _mm_maddubs_epi16(tempT30, coeffThird3); mVal = _mm_add_epi16(tempT00, tempT10); mVal = _mm_add_epi16(mVal, tempT20); mVal = _mm_add_epi16(mVal, tempT30); mVal = _mm_add_epi16(mVal, mAddOffset); mVal = _mm_srai_epi16(mVal, shift); mVal = _mm_packus_epi16(mVal, mVal); _mm_maskmoveu_si128(mVal, mask, (char *)(&dst2[col] + 3 * i_dst)); } src += 4 * i_src; dst0 += 4 * i_dst; dst1 += 4 * i_dst; dst2 += 4 * i_dst; } } /* --------------------------------------------------------------------------- */ void intpl_luma_ext_sse128(pel_t *dst, int i_dst, mct_t *tmp, int i_tmp, int width, int height, const int8_t *coeff) { int row, col; int shift; int16_t const *p; int bsymy = (coeff[1] == coeff[6]); __m128i mAddOffset; __m128i mask = _mm_loadu_si128((__m128i*)(intrinsic_mask[(width & 7) - 1])); // VER shift = 12; mAddOffset = _mm_set1_epi32(1 << (shift - 1)); tmp = tmp - 3 * i_tmp; if (bsymy) { __m128i mCoefy1 = _mm_set1_epi16(coeff[0]); __m128i mCoefy2 = _mm_set1_epi16(coeff[1]); __m128i mCoefy3 = _mm_set1_epi16(coeff[2]); __m128i mCoefy4 = _mm_set1_epi16(coeff[3]); __m128i mVal1, mVal2, mVal; for (row = 0; row < height - 3; row += 4) { p = tmp; for (col = 0; col < width - 7; col += 8) { __m128i T00 = _mm_loadu_si128((__m128i*)(p)); __m128i T10 = _mm_loadu_si128((__m128i*)(p + i_tmp)); __m128i T20 = _mm_loadu_si128((__m128i*)(p + 2 * i_tmp)); __m128i T30 = _mm_loadu_si128((__m128i*)(p + 3 * i_tmp)); __m128i T40 = _mm_loadu_si128((__m128i*)(p + 4 * i_tmp)); __m128i T50 = _mm_loadu_si128((__m128i*)(p + 5 * i_tmp)); __m128i T60 = _mm_loadu_si128((__m128i*)(p + 6 * i_tmp)); __m128i T70 = _mm_loadu_si128((__m128i*)(p + 7 * i_tmp)); __m128i T80 = _mm_loadu_si128((__m128i*)(p + 8 * i_tmp)); __m128i T90 = _mm_loadu_si128((__m128i*)(p + 9 * i_tmp)); __m128i Ta0 = _mm_loadu_si128((__m128i*)(p + 10 * i_tmp)); __m128i T0 = _mm_unpacklo_epi16(T00, T70); __m128i T1 = _mm_unpacklo_epi16(T10, T60); __m128i T2 = _mm_unpacklo_epi16(T20, T50); __m128i T3 = _mm_unpacklo_epi16(T30, T40); __m128i T4 = _mm_unpackhi_epi16(T00, T70); __m128i T5 = _mm_unpackhi_epi16(T10, T60); __m128i T6 = _mm_unpackhi_epi16(T20, T50); __m128i T7 = _mm_unpackhi_epi16(T30, T40); T0 = _mm_madd_epi16(T0, mCoefy1); T1 = _mm_madd_epi16(T1, mCoefy2); T2 = _mm_madd_epi16(T2, mCoefy3); T3 = _mm_madd_epi16(T3, mCoefy4); T4 = _mm_madd_epi16(T4, mCoefy1); T5 = _mm_madd_epi16(T5, mCoefy2); T6 = _mm_madd_epi16(T6, mCoefy3); T7 = _mm_madd_epi16(T7, mCoefy4); mVal1 = _mm_add_epi32(T0, T1); mVal1 = 
_mm_add_epi32(mVal1, T2); mVal1 = _mm_add_epi32(mVal1, T3); mVal2 = _mm_add_epi32(T4, T5); mVal2 = _mm_add_epi32(mVal2, T6); mVal2 = _mm_add_epi32(mVal2, T7); mVal1 = _mm_add_epi32(mVal1, mAddOffset); mVal2 = _mm_add_epi32(mVal2, mAddOffset); mVal1 = _mm_srai_epi32(mVal1, shift); mVal2 = _mm_srai_epi32(mVal2, shift); mVal = _mm_packs_epi32(mVal1, mVal2); mVal = _mm_packus_epi16(mVal, mVal); _mm_storel_epi64((__m128i*)&dst[col], mVal); T0 = _mm_unpacklo_epi16(T10, T80); T1 = _mm_unpacklo_epi16(T20, T70); T2 = _mm_unpacklo_epi16(T30, T60); T3 = _mm_unpacklo_epi16(T40, T50); T4 = _mm_unpackhi_epi16(T10, T80); T5 = _mm_unpackhi_epi16(T20, T70); T6 = _mm_unpackhi_epi16(T30, T60); T7 = _mm_unpackhi_epi16(T40, T50); T0 = _mm_madd_epi16(T0, mCoefy1); T1 = _mm_madd_epi16(T1, mCoefy2); T2 = _mm_madd_epi16(T2, mCoefy3); T3 = _mm_madd_epi16(T3, mCoefy4); T4 = _mm_madd_epi16(T4, mCoefy1); T5 = _mm_madd_epi16(T5, mCoefy2); T6 = _mm_madd_epi16(T6, mCoefy3); T7 = _mm_madd_epi16(T7, mCoefy4); mVal1 = _mm_add_epi32(T0, T1); mVal1 = _mm_add_epi32(mVal1, T2); mVal1 = _mm_add_epi32(mVal1, T3); mVal2 = _mm_add_epi32(T4, T5); mVal2 = _mm_add_epi32(mVal2, T6); mVal2 = _mm_add_epi32(mVal2, T7); mVal1 = _mm_add_epi32(mVal1, mAddOffset); mVal2 = _mm_add_epi32(mVal2, mAddOffset); mVal1 = _mm_srai_epi32(mVal1, shift); mVal2 = _mm_srai_epi32(mVal2, shift); mVal = _mm_packs_epi32(mVal1, mVal2); mVal = _mm_packus_epi16(mVal, mVal); _mm_storel_epi64((__m128i*)(&dst[col] + i_dst), mVal); T0 = _mm_unpacklo_epi16(T20, T90); T1 = _mm_unpacklo_epi16(T30, T80); T2 = _mm_unpacklo_epi16(T40, T70); T3 = _mm_unpacklo_epi16(T50, T60); T4 = _mm_unpackhi_epi16(T20, T90); T5 = _mm_unpackhi_epi16(T30, T80); T6 = _mm_unpackhi_epi16(T40, T70); T7 = _mm_unpackhi_epi16(T50, T60); T0 = _mm_madd_epi16(T0, mCoefy1); T1 = _mm_madd_epi16(T1, mCoefy2); T2 = _mm_madd_epi16(T2, mCoefy3); T3 = _mm_madd_epi16(T3, mCoefy4); T4 = _mm_madd_epi16(T4, mCoefy1); T5 = _mm_madd_epi16(T5, mCoefy2); T6 = _mm_madd_epi16(T6, mCoefy3); T7 = _mm_madd_epi16(T7, mCoefy4); mVal1 = _mm_add_epi32(T0, T1); mVal1 = _mm_add_epi32(mVal1, T2); mVal1 = _mm_add_epi32(mVal1, T3); mVal2 = _mm_add_epi32(T4, T5); mVal2 = _mm_add_epi32(mVal2, T6); mVal2 = _mm_add_epi32(mVal2, T7); mVal1 = _mm_add_epi32(mVal1, mAddOffset); mVal2 = _mm_add_epi32(mVal2, mAddOffset); mVal1 = _mm_srai_epi32(mVal1, shift); mVal2 = _mm_srai_epi32(mVal2, shift); mVal = _mm_packs_epi32(mVal1, mVal2); mVal = _mm_packus_epi16(mVal, mVal); _mm_storel_epi64((__m128i*)(&dst[col] + 2 * i_dst), mVal); T0 = _mm_unpacklo_epi16(T30, Ta0); T1 = _mm_unpacklo_epi16(T40, T90); T2 = _mm_unpacklo_epi16(T50, T80); T3 = _mm_unpacklo_epi16(T60, T70); T4 = _mm_unpackhi_epi16(T30, Ta0); T5 = _mm_unpackhi_epi16(T40, T90); T6 = _mm_unpackhi_epi16(T50, T80); T7 = _mm_unpackhi_epi16(T60, T70); T0 = _mm_madd_epi16(T0, mCoefy1); T1 = _mm_madd_epi16(T1, mCoefy2); T2 = _mm_madd_epi16(T2, mCoefy3); T3 = _mm_madd_epi16(T3, mCoefy4); T4 = _mm_madd_epi16(T4, mCoefy1); T5 = _mm_madd_epi16(T5, mCoefy2); T6 = _mm_madd_epi16(T6, mCoefy3); T7 = _mm_madd_epi16(T7, mCoefy4); mVal1 = _mm_add_epi32(T0, T1); mVal1 = _mm_add_epi32(mVal1, T2); mVal1 = _mm_add_epi32(mVal1, T3); mVal2 = _mm_add_epi32(T4, T5); mVal2 = _mm_add_epi32(mVal2, T6); mVal2 = _mm_add_epi32(mVal2, T7); mVal1 = _mm_add_epi32(mVal1, mAddOffset); mVal2 = _mm_add_epi32(mVal2, mAddOffset); mVal1 = _mm_srai_epi32(mVal1, shift); mVal2 = _mm_srai_epi32(mVal2, shift); mVal = _mm_packs_epi32(mVal1, mVal2); mVal = _mm_packus_epi16(mVal, mVal); _mm_storel_epi64((__m128i*)(&dst[col] + 3 * 
i_dst), mVal); p += 8; } if (col < width) { // store either 1, 2, 3, 4, 5, 6, or 7 8-bit results in dst __m128i T00 = _mm_loadu_si128((__m128i*)(p)); __m128i T10 = _mm_loadu_si128((__m128i*)(p + i_tmp)); __m128i T20 = _mm_loadu_si128((__m128i*)(p + 2 * i_tmp)); __m128i T30 = _mm_loadu_si128((__m128i*)(p + 3 * i_tmp)); __m128i T40 = _mm_loadu_si128((__m128i*)(p + 4 * i_tmp)); __m128i T50 = _mm_loadu_si128((__m128i*)(p + 5 * i_tmp)); __m128i T60 = _mm_loadu_si128((__m128i*)(p + 6 * i_tmp)); __m128i T70 = _mm_loadu_si128((__m128i*)(p + 7 * i_tmp)); __m128i T80 = _mm_loadu_si128((__m128i*)(p + 8 * i_tmp)); __m128i T90 = _mm_loadu_si128((__m128i*)(p + 9 * i_tmp)); __m128i Ta0 = _mm_loadu_si128((__m128i*)(p + 10 * i_tmp)); __m128i T0 = _mm_unpacklo_epi16(T00, T70); __m128i T1 = _mm_unpacklo_epi16(T10, T60); __m128i T2 = _mm_unpacklo_epi16(T20, T50); __m128i T3 = _mm_unpacklo_epi16(T30, T40); __m128i T4 = _mm_unpackhi_epi16(T00, T70); __m128i T5 = _mm_unpackhi_epi16(T10, T60); __m128i T6 = _mm_unpackhi_epi16(T20, T50); __m128i T7 = _mm_unpackhi_epi16(T30, T40); T0 = _mm_madd_epi16(T0, mCoefy1); T1 = _mm_madd_epi16(T1, mCoefy2); T2 = _mm_madd_epi16(T2, mCoefy3); T3 = _mm_madd_epi16(T3, mCoefy4); T4 = _mm_madd_epi16(T4, mCoefy1); T5 = _mm_madd_epi16(T5, mCoefy2); T6 = _mm_madd_epi16(T6, mCoefy3); T7 = _mm_madd_epi16(T7, mCoefy4); mVal1 = _mm_add_epi32(T0, T1); mVal1 = _mm_add_epi32(mVal1, T2); mVal1 = _mm_add_epi32(mVal1, T3); mVal2 = _mm_add_epi32(T4, T5); mVal2 = _mm_add_epi32(mVal2, T6); mVal2 = _mm_add_epi32(mVal2, T7); mVal1 = _mm_add_epi32(mVal1, mAddOffset); mVal2 = _mm_add_epi32(mVal2, mAddOffset); mVal1 = _mm_srai_epi32(mVal1, shift); mVal2 = _mm_srai_epi32(mVal2, shift); mVal = _mm_packs_epi32(mVal1, mVal2); mVal = _mm_packus_epi16(mVal, mVal); _mm_maskmoveu_si128(mVal, mask, (char *)&dst[col]); T0 = _mm_unpacklo_epi16(T10, T80); T1 = _mm_unpacklo_epi16(T20, T70); T2 = _mm_unpacklo_epi16(T30, T60); T3 = _mm_unpacklo_epi16(T40, T50); T4 = _mm_unpackhi_epi16(T10, T80); T5 = _mm_unpackhi_epi16(T20, T70); T6 = _mm_unpackhi_epi16(T30, T60); T7 = _mm_unpackhi_epi16(T40, T50); T0 = _mm_madd_epi16(T0, mCoefy1); T1 = _mm_madd_epi16(T1, mCoefy2); T2 = _mm_madd_epi16(T2, mCoefy3); T3 = _mm_madd_epi16(T3, mCoefy4); T4 = _mm_madd_epi16(T4, mCoefy1); T5 = _mm_madd_epi16(T5, mCoefy2); T6 = _mm_madd_epi16(T6, mCoefy3); T7 = _mm_madd_epi16(T7, mCoefy4); mVal1 = _mm_add_epi32(T0, T1); mVal1 = _mm_add_epi32(mVal1, T2); mVal1 = _mm_add_epi32(mVal1, T3); mVal2 = _mm_add_epi32(T4, T5); mVal2 = _mm_add_epi32(mVal2, T6); mVal2 = _mm_add_epi32(mVal2, T7); mVal1 = _mm_add_epi32(mVal1, mAddOffset); mVal2 = _mm_add_epi32(mVal2, mAddOffset); mVal1 = _mm_srai_epi32(mVal1, shift); mVal2 = _mm_srai_epi32(mVal2, shift); mVal = _mm_packs_epi32(mVal1, mVal2); mVal = _mm_packus_epi16(mVal, mVal); _mm_maskmoveu_si128(mVal, mask, (char *)(&dst[col] + i_dst)); T0 = _mm_unpacklo_epi16(T20, T90); T1 = _mm_unpacklo_epi16(T30, T80); T2 = _mm_unpacklo_epi16(T40, T70); T3 = _mm_unpacklo_epi16(T50, T60); T4 = _mm_unpackhi_epi16(T20, T90); T5 = _mm_unpackhi_epi16(T30, T80); T6 = _mm_unpackhi_epi16(T40, T70); T7 = _mm_unpackhi_epi16(T50, T60); T0 = _mm_madd_epi16(T0, mCoefy1); T1 = _mm_madd_epi16(T1, mCoefy2); T2 = _mm_madd_epi16(T2, mCoefy3); T3 = _mm_madd_epi16(T3, mCoefy4); T4 = _mm_madd_epi16(T4, mCoefy1); T5 = _mm_madd_epi16(T5, mCoefy2); T6 = _mm_madd_epi16(T6, mCoefy3); T7 = _mm_madd_epi16(T7, mCoefy4); mVal1 = _mm_add_epi32(T0, T1); mVal1 = _mm_add_epi32(mVal1, T2); mVal1 = _mm_add_epi32(mVal1, T3); mVal2 = _mm_add_epi32(T4, 
T5); mVal2 = _mm_add_epi32(mVal2, T6); mVal2 = _mm_add_epi32(mVal2, T7); mVal1 = _mm_add_epi32(mVal1, mAddOffset); mVal2 = _mm_add_epi32(mVal2, mAddOffset); mVal1 = _mm_srai_epi32(mVal1, shift); mVal2 = _mm_srai_epi32(mVal2, shift); mVal = _mm_packs_epi32(mVal1, mVal2); mVal = _mm_packus_epi16(mVal, mVal); _mm_maskmoveu_si128(mVal, mask, (char *)(&dst[col] + 2 * i_dst)); T0 = _mm_unpacklo_epi16(T30, Ta0); T1 = _mm_unpacklo_epi16(T40, T90); T2 = _mm_unpacklo_epi16(T50, T80); T3 = _mm_unpacklo_epi16(T60, T70); T4 = _mm_unpackhi_epi16(T30, Ta0); T5 = _mm_unpackhi_epi16(T40, T90); T6 = _mm_unpackhi_epi16(T50, T80); T7 = _mm_unpackhi_epi16(T60, T70); T0 = _mm_madd_epi16(T0, mCoefy1); T1 = _mm_madd_epi16(T1, mCoefy2); T2 = _mm_madd_epi16(T2, mCoefy3); T3 = _mm_madd_epi16(T3, mCoefy4); T4 = _mm_madd_epi16(T4, mCoefy1); T5 = _mm_madd_epi16(T5, mCoefy2); T6 = _mm_madd_epi16(T6, mCoefy3); T7 = _mm_madd_epi16(T7, mCoefy4); mVal1 = _mm_add_epi32(T0, T1); mVal1 = _mm_add_epi32(mVal1, T2); mVal1 = _mm_add_epi32(mVal1, T3); mVal2 = _mm_add_epi32(T4, T5); mVal2 = _mm_add_epi32(mVal2, T6); mVal2 = _mm_add_epi32(mVal2, T7); mVal1 = _mm_add_epi32(mVal1, mAddOffset); mVal2 = _mm_add_epi32(mVal2, mAddOffset); mVal1 = _mm_srai_epi32(mVal1, shift); mVal2 = _mm_srai_epi32(mVal2, shift); mVal = _mm_packs_epi32(mVal1, mVal2); mVal = _mm_packus_epi16(mVal, mVal); _mm_maskmoveu_si128(mVal, mask, (char *)(&dst[col] + 3 * i_dst)); } tmp += 4 * i_tmp; dst += 4 * i_dst; } } else { __m128i mCoefy1 = _mm_set1_epi16(*(int16_t*)(coeff + 0)); __m128i mCoefy2 = _mm_set1_epi16(*(int16_t*)(coeff + 2)); __m128i mCoefy3 = _mm_set1_epi16(*(int16_t*)(coeff + 4)); __m128i mCoefy4 = _mm_set1_epi16(*(int16_t*)(coeff + 6)); __m128i mVal1, mVal2, mVal; mCoefy1 = _mm_cvtepi8_epi16(mCoefy1); mCoefy2 = _mm_cvtepi8_epi16(mCoefy2); mCoefy3 = _mm_cvtepi8_epi16(mCoefy3); mCoefy4 = _mm_cvtepi8_epi16(mCoefy4); for (row = 0; row < height; row++) { p = tmp; for (col = 0; col < width - 7; col += 8) { __m128i T00 = _mm_loadu_si128((__m128i*)(p)); __m128i T10 = _mm_loadu_si128((__m128i*)(p + i_tmp)); __m128i T20 = _mm_loadu_si128((__m128i*)(p + 2 * i_tmp)); __m128i T30 = _mm_loadu_si128((__m128i*)(p + 3 * i_tmp)); __m128i T40 = _mm_loadu_si128((__m128i*)(p + 4 * i_tmp)); __m128i T50 = _mm_loadu_si128((__m128i*)(p + 5 * i_tmp)); __m128i T60 = _mm_loadu_si128((__m128i*)(p + 6 * i_tmp)); __m128i T70 = _mm_loadu_si128((__m128i*)(p + 7 * i_tmp)); __m128i T80 = _mm_loadu_si128((__m128i*)(p + 8 * i_tmp)); __m128i T90 = _mm_loadu_si128((__m128i*)(p + 9 * i_tmp)); __m128i Ta0 = _mm_loadu_si128((__m128i*)(p + 10 * i_tmp)); __m128i T0 = _mm_unpacklo_epi16(T00, T10); __m128i T1 = _mm_unpacklo_epi16(T20, T30); __m128i T2 = _mm_unpacklo_epi16(T40, T50); __m128i T3 = _mm_unpacklo_epi16(T60, T70); __m128i T4 = _mm_unpackhi_epi16(T00, T10); __m128i T5 = _mm_unpackhi_epi16(T20, T30); __m128i T6 = _mm_unpackhi_epi16(T40, T50); __m128i T7 = _mm_unpackhi_epi16(T60, T70); T0 = _mm_madd_epi16(T0, mCoefy1); T1 = _mm_madd_epi16(T1, mCoefy2); T2 = _mm_madd_epi16(T2, mCoefy3); T3 = _mm_madd_epi16(T3, mCoefy4); T4 = _mm_madd_epi16(T4, mCoefy1); T5 = _mm_madd_epi16(T5, mCoefy2); T6 = _mm_madd_epi16(T6, mCoefy3); T7 = _mm_madd_epi16(T7, mCoefy4); mVal1 = _mm_add_epi32(T0, T1); mVal1 = _mm_add_epi32(mVal1, T2); mVal1 = _mm_add_epi32(mVal1, T3); mVal2 = _mm_add_epi32(T4, T5); mVal2 = _mm_add_epi32(mVal2, T6); mVal2 = _mm_add_epi32(mVal2, T7); mVal1 = _mm_add_epi32(mVal1, mAddOffset); mVal2 = _mm_add_epi32(mVal2, mAddOffset); mVal1 = _mm_srai_epi32(mVal1, shift); mVal2 = 
_mm_srai_epi32(mVal2, shift); mVal = _mm_packs_epi32(mVal1, mVal2); mVal = _mm_packus_epi16(mVal, mVal); _mm_storel_epi64((__m128i*)&dst[col], mVal); T0 = _mm_unpacklo_epi16(T10, T20); T1 = _mm_unpacklo_epi16(T30, T40); T2 = _mm_unpacklo_epi16(T50, T60); T3 = _mm_unpacklo_epi16(T70, T80); T4 = _mm_unpackhi_epi16(T10, T20); T5 = _mm_unpackhi_epi16(T30, T40); T6 = _mm_unpackhi_epi16(T50, T60); T7 = _mm_unpackhi_epi16(T70, T80); T0 = _mm_madd_epi16(T0, mCoefy1); T1 = _mm_madd_epi16(T1, mCoefy2); T2 = _mm_madd_epi16(T2, mCoefy3); T3 = _mm_madd_epi16(T3, mCoefy4); T4 = _mm_madd_epi16(T4, mCoefy1); T5 = _mm_madd_epi16(T5, mCoefy2); T6 = _mm_madd_epi16(T6, mCoefy3); T7 = _mm_madd_epi16(T7, mCoefy4); mVal1 = _mm_add_epi32(T0, T1); mVal1 = _mm_add_epi32(mVal1, T2); mVal1 = _mm_add_epi32(mVal1, T3); mVal2 = _mm_add_epi32(T4, T5); mVal2 = _mm_add_epi32(mVal2, T6); mVal2 = _mm_add_epi32(mVal2, T7); mVal1 = _mm_add_epi32(mVal1, mAddOffset); mVal2 = _mm_add_epi32(mVal2, mAddOffset); mVal1 = _mm_srai_epi32(mVal1, shift); mVal2 = _mm_srai_epi32(mVal2, shift); mVal = _mm_packs_epi32(mVal1, mVal2); mVal = _mm_packus_epi16(mVal, mVal); _mm_storel_epi64((__m128i*)(&dst[col] + i_dst), mVal); T0 = _mm_unpacklo_epi16(T20, T30); T1 = _mm_unpacklo_epi16(T40, T50); T2 = _mm_unpacklo_epi16(T60, T70); T3 = _mm_unpacklo_epi16(T80, T90); T4 = _mm_unpackhi_epi16(T20, T30); T5 = _mm_unpackhi_epi16(T40, T50); T6 = _mm_unpackhi_epi16(T60, T70); T7 = _mm_unpackhi_epi16(T80, T90); T0 = _mm_madd_epi16(T0, mCoefy1); T1 = _mm_madd_epi16(T1, mCoefy2); T2 = _mm_madd_epi16(T2, mCoefy3); T3 = _mm_madd_epi16(T3, mCoefy4); T4 = _mm_madd_epi16(T4, mCoefy1); T5 = _mm_madd_epi16(T5, mCoefy2); T6 = _mm_madd_epi16(T6, mCoefy3); T7 = _mm_madd_epi16(T7, mCoefy4); mVal1 = _mm_add_epi32(T0, T1); mVal1 = _mm_add_epi32(mVal1, T2); mVal1 = _mm_add_epi32(mVal1, T3); mVal2 = _mm_add_epi32(T4, T5); mVal2 = _mm_add_epi32(mVal2, T6); mVal2 = _mm_add_epi32(mVal2, T7); mVal1 = _mm_add_epi32(mVal1, mAddOffset); mVal2 = _mm_add_epi32(mVal2, mAddOffset); mVal1 = _mm_srai_epi32(mVal1, shift); mVal2 = _mm_srai_epi32(mVal2, shift); mVal = _mm_packs_epi32(mVal1, mVal2); mVal = _mm_packus_epi16(mVal, mVal); _mm_storel_epi64((__m128i*)(&dst[col] + 2 * i_dst), mVal); T0 = _mm_unpacklo_epi16(T30, T40); T1 = _mm_unpacklo_epi16(T50, T60); T2 = _mm_unpacklo_epi16(T70, T80); T3 = _mm_unpacklo_epi16(T90, Ta0); T4 = _mm_unpackhi_epi16(T30, T40); T5 = _mm_unpackhi_epi16(T50, T60); T6 = _mm_unpackhi_epi16(T70, T80); T7 = _mm_unpackhi_epi16(T90, Ta0); T0 = _mm_madd_epi16(T0, mCoefy1); T1 = _mm_madd_epi16(T1, mCoefy2); T2 = _mm_madd_epi16(T2, mCoefy3); T3 = _mm_madd_epi16(T3, mCoefy4); T4 = _mm_madd_epi16(T4, mCoefy1); T5 = _mm_madd_epi16(T5, mCoefy2); T6 = _mm_madd_epi16(T6, mCoefy3); T7 = _mm_madd_epi16(T7, mCoefy4); mVal1 = _mm_add_epi32(T0, T1); mVal1 = _mm_add_epi32(mVal1, T2); mVal1 = _mm_add_epi32(mVal1, T3); mVal2 = _mm_add_epi32(T4, T5); mVal2 = _mm_add_epi32(mVal2, T6); mVal2 = _mm_add_epi32(mVal2, T7); mVal1 = _mm_add_epi32(mVal1, mAddOffset); mVal2 = _mm_add_epi32(mVal2, mAddOffset); mVal1 = _mm_srai_epi32(mVal1, shift); mVal2 = _mm_srai_epi32(mVal2, shift); mVal = _mm_packs_epi32(mVal1, mVal2); mVal = _mm_packus_epi16(mVal, mVal); _mm_storel_epi64((__m128i*)(&dst[col] + 3 * i_dst), mVal); p += 8; } if (col < width) { __m128i T00 = _mm_loadu_si128((__m128i*)(p)); __m128i T10 = _mm_loadu_si128((__m128i*)(p + i_tmp)); __m128i T20 = _mm_loadu_si128((__m128i*)(p + 2 * i_tmp)); __m128i T30 = _mm_loadu_si128((__m128i*)(p + 3 * i_tmp)); __m128i T40 = 
_mm_loadu_si128((__m128i*)(p + 4 * i_tmp)); __m128i T50 = _mm_loadu_si128((__m128i*)(p + 5 * i_tmp)); __m128i T60 = _mm_loadu_si128((__m128i*)(p + 6 * i_tmp)); __m128i T70 = _mm_loadu_si128((__m128i*)(p + 7 * i_tmp)); __m128i T80 = _mm_loadu_si128((__m128i*)(p + 8 * i_tmp)); __m128i T90 = _mm_loadu_si128((__m128i*)(p + 9 * i_tmp)); __m128i Ta0 = _mm_loadu_si128((__m128i*)(p + 10 * i_tmp)); __m128i T0 = _mm_unpacklo_epi16(T00, T10); __m128i T1 = _mm_unpacklo_epi16(T20, T30); __m128i T2 = _mm_unpacklo_epi16(T40, T50); __m128i T3 = _mm_unpacklo_epi16(T60, T70); __m128i T4 = _mm_unpackhi_epi16(T00, T10); __m128i T5 = _mm_unpackhi_epi16(T20, T30); __m128i T6 = _mm_unpackhi_epi16(T40, T50); __m128i T7 = _mm_unpackhi_epi16(T60, T70); T0 = _mm_madd_epi16(T0, mCoefy1); T1 = _mm_madd_epi16(T1, mCoefy2); T2 = _mm_madd_epi16(T2, mCoefy3); T3 = _mm_madd_epi16(T3, mCoefy4); T4 = _mm_madd_epi16(T4, mCoefy1); T5 = _mm_madd_epi16(T5, mCoefy2); T6 = _mm_madd_epi16(T6, mCoefy3); T7 = _mm_madd_epi16(T7, mCoefy4); mVal1 = _mm_add_epi32(T0, T1); mVal1 = _mm_add_epi32(mVal1, T2); mVal1 = _mm_add_epi32(mVal1, T3); mVal2 = _mm_add_epi32(T4, T5); mVal2 = _mm_add_epi32(mVal2, T6); mVal2 = _mm_add_epi32(mVal2, T7); mVal1 = _mm_add_epi32(mVal1, mAddOffset); mVal2 = _mm_add_epi32(mVal2, mAddOffset); mVal1 = _mm_srai_epi32(mVal1, shift); mVal2 = _mm_srai_epi32(mVal2, shift); mVal = _mm_packs_epi32(mVal1, mVal2); mVal = _mm_packus_epi16(mVal, mVal); _mm_maskmoveu_si128(mVal, mask, (char *)&dst[col]); T0 = _mm_unpacklo_epi16(T10, T20); T1 = _mm_unpacklo_epi16(T30, T40); T2 = _mm_unpacklo_epi16(T50, T60); T3 = _mm_unpacklo_epi16(T70, T80); T4 = _mm_unpackhi_epi16(T10, T20); T5 = _mm_unpackhi_epi16(T30, T40); T6 = _mm_unpackhi_epi16(T50, T60); T7 = _mm_unpackhi_epi16(T70, T80); T0 = _mm_madd_epi16(T0, mCoefy1); T1 = _mm_madd_epi16(T1, mCoefy2); T2 = _mm_madd_epi16(T2, mCoefy3); T3 = _mm_madd_epi16(T3, mCoefy4); T4 = _mm_madd_epi16(T4, mCoefy1); T5 = _mm_madd_epi16(T5, mCoefy2); T6 = _mm_madd_epi16(T6, mCoefy3); T7 = _mm_madd_epi16(T7, mCoefy4); mVal1 = _mm_add_epi32(T0, T1); mVal1 = _mm_add_epi32(mVal1, T2); mVal1 = _mm_add_epi32(mVal1, T3); mVal2 = _mm_add_epi32(T4, T5); mVal2 = _mm_add_epi32(mVal2, T6); mVal2 = _mm_add_epi32(mVal2, T7); mVal1 = _mm_add_epi32(mVal1, mAddOffset); mVal2 = _mm_add_epi32(mVal2, mAddOffset); mVal1 = _mm_srai_epi32(mVal1, shift); mVal2 = _mm_srai_epi32(mVal2, shift); mVal = _mm_packs_epi32(mVal1, mVal2); mVal = _mm_packus_epi16(mVal, mVal); _mm_maskmoveu_si128(mVal, mask, (char *)(&dst[col] + i_dst)); T0 = _mm_unpacklo_epi16(T20, T30); T1 = _mm_unpacklo_epi16(T40, T50); T2 = _mm_unpacklo_epi16(T60, T70); T3 = _mm_unpacklo_epi16(T80, T90); T4 = _mm_unpackhi_epi16(T20, T30); T5 = _mm_unpackhi_epi16(T40, T50); T6 = _mm_unpackhi_epi16(T60, T70); T7 = _mm_unpackhi_epi16(T80, T90); T0 = _mm_madd_epi16(T0, mCoefy1); T1 = _mm_madd_epi16(T1, mCoefy2); T2 = _mm_madd_epi16(T2, mCoefy3); T3 = _mm_madd_epi16(T3, mCoefy4); T4 = _mm_madd_epi16(T4, mCoefy1); T5 = _mm_madd_epi16(T5, mCoefy2); T6 = _mm_madd_epi16(T6, mCoefy3); T7 = _mm_madd_epi16(T7, mCoefy4); mVal1 = _mm_add_epi32(T0, T1); mVal1 = _mm_add_epi32(mVal1, T2); mVal1 = _mm_add_epi32(mVal1, T3); mVal2 = _mm_add_epi32(T4, T5); mVal2 = _mm_add_epi32(mVal2, T6); mVal2 = _mm_add_epi32(mVal2, T7); mVal1 = _mm_add_epi32(mVal1, mAddOffset); mVal2 = _mm_add_epi32(mVal2, mAddOffset); mVal1 = _mm_srai_epi32(mVal1, shift); mVal2 = _mm_srai_epi32(mVal2, shift); mVal = _mm_packs_epi32(mVal1, mVal2); mVal = _mm_packus_epi16(mVal, mVal); _mm_maskmoveu_si128(mVal, 
mask, (char *)(&dst[col] + 2 * i_dst)); T0 = _mm_unpacklo_epi16(T30, T40); T1 = _mm_unpacklo_epi16(T50, T60); T2 = _mm_unpacklo_epi16(T70, T80); T3 = _mm_unpacklo_epi16(T90, Ta0); T4 = _mm_unpackhi_epi16(T30, T40); T5 = _mm_unpackhi_epi16(T50, T60); T6 = _mm_unpackhi_epi16(T70, T80); T7 = _mm_unpackhi_epi16(T90, Ta0); T0 = _mm_madd_epi16(T0, mCoefy1); T1 = _mm_madd_epi16(T1, mCoefy2); T2 = _mm_madd_epi16(T2, mCoefy3); T3 = _mm_madd_epi16(T3, mCoefy4); T4 = _mm_madd_epi16(T4, mCoefy1); T5 = _mm_madd_epi16(T5, mCoefy2); T6 = _mm_madd_epi16(T6, mCoefy3); T7 = _mm_madd_epi16(T7, mCoefy4); mVal1 = _mm_add_epi32(T0, T1); mVal1 = _mm_add_epi32(mVal1, T2); mVal1 = _mm_add_epi32(mVal1, T3); mVal2 = _mm_add_epi32(T4, T5); mVal2 = _mm_add_epi32(mVal2, T6); mVal2 = _mm_add_epi32(mVal2, T7); mVal1 = _mm_add_epi32(mVal1, mAddOffset); mVal2 = _mm_add_epi32(mVal2, mAddOffset); mVal1 = _mm_srai_epi32(mVal1, shift); mVal2 = _mm_srai_epi32(mVal2, shift); mVal = _mm_packs_epi32(mVal1, mVal2); mVal = _mm_packus_epi16(mVal, mVal); _mm_maskmoveu_si128(mVal, mask, (char *)(&dst[col] + 3 * i_dst)); } tmp += 4 * i_tmp; dst += 4 * i_dst; } } } void intpl_luma_ext_x3_sse128(pel_t *const dst[3], int i_dst, mct_t *tmp, int i_tmp, int width, int height, const int8_t **coeff) { /* intpl_luma_ext_sse128(dst0, i_dst, tmp, i_tmp, width, height, coeff[0]); intpl_luma_ext_sse128(dst1, i_dst, tmp, i_tmp, width, height, coeff[1]); intpl_luma_ext_sse128(dst2, i_dst, tmp, i_tmp, width, height, coeff[2]); */ int row, col; int shift; int16_t const *p; int bsymyFirst = (coeff[0][1] == coeff[0][6]); int bsymySecond = (coeff[1][1] == coeff[1][6]); int bsymyThird = (coeff[2][1] == coeff[2][6]); __m128i mAddOffset; __m128i mask = _mm_loadu_si128((__m128i*)(intrinsic_mask[(width & 7) - 1])); // VER shift = 12; mAddOffset = _mm_set1_epi32(1 << (shift - 1)); tmp = tmp - 3 * i_tmp; __m128i mCoefy1First,mCoefy2First,mCoefy3First,mCoefy4First; __m128i mCoefy1Second,mCoefy2Second,mCoefy3Second,mCoefy4Second; __m128i mCoefy1Third,mCoefy2Third,mCoefy3Third,mCoefy4Third; pel_t *dst0 = dst[0]; pel_t *dst1 = dst[1]; pel_t *dst2 = dst[2]; if(bsymyFirst) { mCoefy1First = _mm_set1_epi16(coeff[0][0]); mCoefy2First = _mm_set1_epi16(coeff[0][1]); mCoefy3First = _mm_set1_epi16(coeff[0][2]); mCoefy4First = _mm_set1_epi16(coeff[0][3]); } else { mCoefy1First = _mm_set1_epi16(*(int16_t*)coeff[0]); mCoefy2First = _mm_set1_epi16(*(int16_t*)(coeff[0] + 2)); mCoefy3First = _mm_set1_epi16(*(int16_t*)(coeff[0] + 4)); mCoefy4First = _mm_set1_epi16(*(int16_t*)(coeff[0] + 6)); mCoefy1First = _mm_cvtepi8_epi16(mCoefy1First); mCoefy2First = _mm_cvtepi8_epi16(mCoefy2First); mCoefy3First = _mm_cvtepi8_epi16(mCoefy3First); mCoefy4First = _mm_cvtepi8_epi16(mCoefy4First); } if(bsymySecond) { mCoefy1Second = _mm_set1_epi16(coeff[1][0]); mCoefy2Second = _mm_set1_epi16(coeff[1][1]); mCoefy3Second = _mm_set1_epi16(coeff[1][2]); mCoefy4Second = _mm_set1_epi16(coeff[1][3]); } else { mCoefy1Second = _mm_set1_epi16(*(int16_t*)coeff[1]); mCoefy2Second = _mm_set1_epi16(*(int16_t*)(coeff[1] + 2)); mCoefy3Second = _mm_set1_epi16(*(int16_t*)(coeff[1] + 4)); mCoefy4Second = _mm_set1_epi16(*(int16_t*)(coeff[1] + 6)); mCoefy1Second = _mm_cvtepi8_epi16(mCoefy1Second); mCoefy2Second = _mm_cvtepi8_epi16(mCoefy2Second); mCoefy3Second = _mm_cvtepi8_epi16(mCoefy3Second); mCoefy4Second = _mm_cvtepi8_epi16(mCoefy4Second); } if(bsymyThird) { mCoefy1Third = _mm_set1_epi16(coeff[2][0]); mCoefy2Third = _mm_set1_epi16(coeff[2][1]); mCoefy3Third = _mm_set1_epi16(coeff[2][2]); mCoefy4Third = 
_mm_set1_epi16(coeff[2][3]); } else { mCoefy1Third = _mm_set1_epi16(*(int16_t*)coeff[2]); mCoefy2Third = _mm_set1_epi16(*(int16_t*)(coeff[2] + 2)); mCoefy3Third = _mm_set1_epi16(*(int16_t*)(coeff[2] + 4)); mCoefy4Third = _mm_set1_epi16(*(int16_t*)(coeff[2] + 6)); mCoefy1Third = _mm_cvtepi8_epi16(mCoefy1Third); mCoefy2Third = _mm_cvtepi8_epi16(mCoefy2Third); mCoefy3Third = _mm_cvtepi8_epi16(mCoefy3Third); mCoefy4Third = _mm_cvtepi8_epi16(mCoefy4Third); } __m128i T00, T10, T20, T30, T40, T50, T60, T70, T80, T90, Ta0; __m128i T0, T1, T2, T3, T4, T5, T6, T7; __m128i mVal1, mVal2, mVal; // for (row = 0; row < height - 3 ; row += 4) { p = tmp; for (col = 0; col < width - 7; col += 8) { T00 = _mm_loadu_si128((__m128i*)(p)); T10 = _mm_loadu_si128((__m128i*)(p + i_tmp)); T20 = _mm_loadu_si128((__m128i*)(p + 2 * i_tmp)); T30 = _mm_loadu_si128((__m128i*)(p + 3 * i_tmp)); T40 = _mm_loadu_si128((__m128i*)(p + 4 * i_tmp)); T50 = _mm_loadu_si128((__m128i*)(p + 5 * i_tmp)); T60 = _mm_loadu_si128((__m128i*)(p + 6 * i_tmp)); T70 = _mm_loadu_si128((__m128i*)(p + 7 * i_tmp)); T80 = _mm_loadu_si128((__m128i*)(p + 8 * i_tmp)); T90 = _mm_loadu_si128((__m128i*)(p + 9 * i_tmp)); Ta0 = _mm_loadu_si128((__m128i*)(p + 10 * i_tmp)); //First if (bsymyFirst) { T0 = _mm_unpacklo_epi16(T00, T70); T1 = _mm_unpacklo_epi16(T10, T60); T2 = _mm_unpacklo_epi16(T20, T50); T3 = _mm_unpacklo_epi16(T30, T40); T4 = _mm_unpackhi_epi16(T00, T70); T5 = _mm_unpackhi_epi16(T10, T60); T6 = _mm_unpackhi_epi16(T20, T50); T7 = _mm_unpackhi_epi16(T30, T40); } else { T0 = _mm_unpacklo_epi16(T00, T10); T1 = _mm_unpacklo_epi16(T20, T30); T2 = _mm_unpacklo_epi16(T40, T50); T3 = _mm_unpacklo_epi16(T60, T70); T4 = _mm_unpackhi_epi16(T00, T10); T5 = _mm_unpackhi_epi16(T20, T30); T6 = _mm_unpackhi_epi16(T40, T50); T7 = _mm_unpackhi_epi16(T60, T70); } T0 = _mm_madd_epi16(T0, mCoefy1First); T1 = _mm_madd_epi16(T1, mCoefy2First); T2 = _mm_madd_epi16(T2, mCoefy3First); T3 = _mm_madd_epi16(T3, mCoefy4First); T4 = _mm_madd_epi16(T4, mCoefy1First); T5 = _mm_madd_epi16(T5, mCoefy2First); T6 = _mm_madd_epi16(T6, mCoefy3First); T7 = _mm_madd_epi16(T7, mCoefy4First); mVal1 = _mm_add_epi32(T0, T1); mVal1 = _mm_add_epi32(mVal1, T2); mVal1 = _mm_add_epi32(mVal1, T3); mVal2 = _mm_add_epi32(T4, T5); mVal2 = _mm_add_epi32(mVal2, T6); mVal2 = _mm_add_epi32(mVal2, T7); mVal1 = _mm_add_epi32(mVal1, mAddOffset); mVal2 = _mm_add_epi32(mVal2, mAddOffset); mVal1 = _mm_srai_epi32(mVal1, shift); mVal2 = _mm_srai_epi32(mVal2, shift); mVal = _mm_packs_epi32(mVal1, mVal2); mVal = _mm_packus_epi16(mVal, mVal); _mm_storel_epi64((__m128i*)&dst0[col], mVal); if (bsymyFirst) { T0 = _mm_unpacklo_epi16(T10, T80); T1 = _mm_unpacklo_epi16(T20, T70); T2 = _mm_unpacklo_epi16(T30, T60); T3 = _mm_unpacklo_epi16(T40, T50); T4 = _mm_unpackhi_epi16(T10, T80); T5 = _mm_unpackhi_epi16(T20, T70); T6 = _mm_unpackhi_epi16(T30, T60); T7 = _mm_unpackhi_epi16(T40, T50); } else { T0 = _mm_unpacklo_epi16(T10, T20); T1 = _mm_unpacklo_epi16(T30, T40); T2 = _mm_unpacklo_epi16(T50, T60); T3 = _mm_unpacklo_epi16(T70, T80); T4 = _mm_unpackhi_epi16(T10, T20); T5 = _mm_unpackhi_epi16(T30, T40); T6 = _mm_unpackhi_epi16(T50, T60); T7 = _mm_unpackhi_epi16(T70, T80); } T0 = _mm_madd_epi16(T0, mCoefy1First); T1 = _mm_madd_epi16(T1, mCoefy2First); T2 = _mm_madd_epi16(T2, mCoefy3First); T3 = _mm_madd_epi16(T3, mCoefy4First); T4 = _mm_madd_epi16(T4, mCoefy1First); T5 = _mm_madd_epi16(T5, mCoefy2First); T6 = _mm_madd_epi16(T6, mCoefy3First); T7 = _mm_madd_epi16(T7, mCoefy4First); mVal1 = _mm_add_epi32(T0, T1); mVal1 = 
_mm_add_epi32(mVal1, T2); mVal1 = _mm_add_epi32(mVal1, T3); mVal2 = _mm_add_epi32(T4, T5); mVal2 = _mm_add_epi32(mVal2, T6); mVal2 = _mm_add_epi32(mVal2, T7); mVal1 = _mm_add_epi32(mVal1, mAddOffset); mVal2 = _mm_add_epi32(mVal2, mAddOffset); mVal1 = _mm_srai_epi32(mVal1, shift); mVal2 = _mm_srai_epi32(mVal2, shift); mVal = _mm_packs_epi32(mVal1, mVal2); mVal = _mm_packus_epi16(mVal, mVal); _mm_storel_epi64((__m128i*)(&dst0[col] + i_dst), mVal); if (bsymyFirst) { T0 = _mm_unpacklo_epi16(T20, T90); T1 = _mm_unpacklo_epi16(T30, T80); T2 = _mm_unpacklo_epi16(T40, T70); T3 = _mm_unpacklo_epi16(T50, T60); T4 = _mm_unpackhi_epi16(T20, T90); T5 = _mm_unpackhi_epi16(T30, T80); T6 = _mm_unpackhi_epi16(T40, T70); T7 = _mm_unpackhi_epi16(T50, T60); } else { T0 = _mm_unpacklo_epi16(T20, T30); T1 = _mm_unpacklo_epi16(T40, T50); T2 = _mm_unpacklo_epi16(T60, T70); T3 = _mm_unpacklo_epi16(T80, T90); T4 = _mm_unpackhi_epi16(T20, T30); T5 = _mm_unpackhi_epi16(T40, T50); T6 = _mm_unpackhi_epi16(T60, T70); T7 = _mm_unpackhi_epi16(T80, T90); } T0 = _mm_madd_epi16(T0, mCoefy1First); T1 = _mm_madd_epi16(T1, mCoefy2First); T2 = _mm_madd_epi16(T2, mCoefy3First); T3 = _mm_madd_epi16(T3, mCoefy4First); T4 = _mm_madd_epi16(T4, mCoefy1First); T5 = _mm_madd_epi16(T5, mCoefy2First); T6 = _mm_madd_epi16(T6, mCoefy3First); T7 = _mm_madd_epi16(T7, mCoefy4First); mVal1 = _mm_add_epi32(T0, T1); mVal1 = _mm_add_epi32(mVal1, T2); mVal1 = _mm_add_epi32(mVal1, T3); mVal2 = _mm_add_epi32(T4, T5); mVal2 = _mm_add_epi32(mVal2, T6); mVal2 = _mm_add_epi32(mVal2, T7); mVal1 = _mm_add_epi32(mVal1, mAddOffset); mVal2 = _mm_add_epi32(mVal2, mAddOffset); mVal1 = _mm_srai_epi32(mVal1, shift); mVal2 = _mm_srai_epi32(mVal2, shift); mVal = _mm_packs_epi32(mVal1, mVal2); mVal = _mm_packus_epi16(mVal, mVal); _mm_storel_epi64((__m128i*)(&dst0[col] + 2 * i_dst), mVal); if (bsymyFirst) { T0 = _mm_unpacklo_epi16(T30, Ta0); T1 = _mm_unpacklo_epi16(T40, T90); T2 = _mm_unpacklo_epi16(T50, T80); T3 = _mm_unpacklo_epi16(T60, T70); T4 = _mm_unpackhi_epi16(T30, Ta0); T5 = _mm_unpackhi_epi16(T40, T90); T6 = _mm_unpackhi_epi16(T50, T80); T7 = _mm_unpackhi_epi16(T60, T70); } else { T0 = _mm_unpacklo_epi16(T30, T40); T1 = _mm_unpacklo_epi16(T50, T60); T2 = _mm_unpacklo_epi16(T70, T80); T3 = _mm_unpacklo_epi16(T90, Ta0); T4 = _mm_unpackhi_epi16(T30, T40); T5 = _mm_unpackhi_epi16(T50, T60); T6 = _mm_unpackhi_epi16(T70, T80); T7 = _mm_unpackhi_epi16(T90, Ta0); } T0 = _mm_madd_epi16(T0, mCoefy1First); T1 = _mm_madd_epi16(T1, mCoefy2First); T2 = _mm_madd_epi16(T2, mCoefy3First); T3 = _mm_madd_epi16(T3, mCoefy4First); T4 = _mm_madd_epi16(T4, mCoefy1First); T5 = _mm_madd_epi16(T5, mCoefy2First); T6 = _mm_madd_epi16(T6, mCoefy3First); T7 = _mm_madd_epi16(T7, mCoefy4First); mVal1 = _mm_add_epi32(T0, T1); mVal1 = _mm_add_epi32(mVal1, T2); mVal1 = _mm_add_epi32(mVal1, T3); mVal2 = _mm_add_epi32(T4, T5); mVal2 = _mm_add_epi32(mVal2, T6); mVal2 = _mm_add_epi32(mVal2, T7); mVal1 = _mm_add_epi32(mVal1, mAddOffset); mVal2 = _mm_add_epi32(mVal2, mAddOffset); mVal1 = _mm_srai_epi32(mVal1, shift); mVal2 = _mm_srai_epi32(mVal2, shift); mVal = _mm_packs_epi32(mVal1, mVal2); mVal = _mm_packus_epi16(mVal, mVal); _mm_storel_epi64((__m128i*)(&dst0[col] + 3 * i_dst), mVal); //Second if (bsymySecond) { T0 = _mm_unpacklo_epi16(T00, T70); T1 = _mm_unpacklo_epi16(T10, T60); T2 = _mm_unpacklo_epi16(T20, T50); T3 = _mm_unpacklo_epi16(T30, T40); T4 = _mm_unpackhi_epi16(T00, T70); T5 = _mm_unpackhi_epi16(T10, T60); T6 = _mm_unpackhi_epi16(T20, T50); T7 = _mm_unpackhi_epi16(T30, T40); } else { 
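/* Note on the unpack/madd scheme (illustrative sketch, not part of the
 * original flow): _mm_madd_epi16 multiplies adjacent 16-bit pairs and sums
 * each pair into one 32-bit lane, so interleaving two rows a and b with
 * _mm_unpacklo/hi_epi16 and madd-ing against a replicated (c_a, c_b) pair
 * yields c_a*a[x] + c_b*b[x] per lane; four madds cover all eight taps.
 * When the filter is even-symmetric (coeff[1] == coeff[6]), rows k and
 * 7-k share a coefficient, so the symmetric branch pairs them and each
 * madd computes c_k * (row_k[x] + row_{7-k}[x]). Scalar equivalent of one
 * output sample, assuming 16-bit intermediates at p[]:
 *
 *     int32_t sum = 0;
 *     for (int k = 0; k < 8; k++) {
 *         sum += coeff[k] * p[k * i_tmp];    // general (non-symmetric) form
 *     }
 */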
T0 = _mm_unpacklo_epi16(T00, T10); T1 = _mm_unpacklo_epi16(T20, T30); T2 = _mm_unpacklo_epi16(T40, T50); T3 = _mm_unpacklo_epi16(T60, T70); T4 = _mm_unpackhi_epi16(T00, T10); T5 = _mm_unpackhi_epi16(T20, T30); T6 = _mm_unpackhi_epi16(T40, T50); T7 = _mm_unpackhi_epi16(T60, T70); } T0 = _mm_madd_epi16(T0, mCoefy1Second); T1 = _mm_madd_epi16(T1, mCoefy2Second); T2 = _mm_madd_epi16(T2, mCoefy3Second); T3 = _mm_madd_epi16(T3, mCoefy4Second); T4 = _mm_madd_epi16(T4, mCoefy1Second); T5 = _mm_madd_epi16(T5, mCoefy2Second); T6 = _mm_madd_epi16(T6, mCoefy3Second); T7 = _mm_madd_epi16(T7, mCoefy4Second); mVal1 = _mm_add_epi32(T0, T1); mVal1 = _mm_add_epi32(mVal1, T2); mVal1 = _mm_add_epi32(mVal1, T3); mVal2 = _mm_add_epi32(T4, T5); mVal2 = _mm_add_epi32(mVal2, T6); mVal2 = _mm_add_epi32(mVal2, T7); mVal1 = _mm_add_epi32(mVal1, mAddOffset); mVal2 = _mm_add_epi32(mVal2, mAddOffset); mVal1 = _mm_srai_epi32(mVal1, shift); mVal2 = _mm_srai_epi32(mVal2, shift); mVal = _mm_packs_epi32(mVal1, mVal2); mVal = _mm_packus_epi16(mVal, mVal); _mm_storel_epi64((__m128i*)&dst1[col], mVal); if (bsymySecond) { T0 = _mm_unpacklo_epi16(T10, T80); T1 = _mm_unpacklo_epi16(T20, T70); T2 = _mm_unpacklo_epi16(T30, T60); T3 = _mm_unpacklo_epi16(T40, T50); T4 = _mm_unpackhi_epi16(T10, T80); T5 = _mm_unpackhi_epi16(T20, T70); T6 = _mm_unpackhi_epi16(T30, T60); T7 = _mm_unpackhi_epi16(T40, T50); } else { T0 = _mm_unpacklo_epi16(T10, T20); T1 = _mm_unpacklo_epi16(T30, T40); T2 = _mm_unpacklo_epi16(T50, T60); T3 = _mm_unpacklo_epi16(T70, T80); T4 = _mm_unpackhi_epi16(T10, T20); T5 = _mm_unpackhi_epi16(T30, T40); T6 = _mm_unpackhi_epi16(T50, T60); T7 = _mm_unpackhi_epi16(T70, T80); } T0 = _mm_madd_epi16(T0, mCoefy1Second); T1 = _mm_madd_epi16(T1, mCoefy2Second); T2 = _mm_madd_epi16(T2, mCoefy3Second); T3 = _mm_madd_epi16(T3, mCoefy4Second); T4 = _mm_madd_epi16(T4, mCoefy1Second); T5 = _mm_madd_epi16(T5, mCoefy2Second); T6 = _mm_madd_epi16(T6, mCoefy3Second); T7 = _mm_madd_epi16(T7, mCoefy4Second); mVal1 = _mm_add_epi32(T0, T1); mVal1 = _mm_add_epi32(mVal1, T2); mVal1 = _mm_add_epi32(mVal1, T3); mVal2 = _mm_add_epi32(T4, T5); mVal2 = _mm_add_epi32(mVal2, T6); mVal2 = _mm_add_epi32(mVal2, T7); mVal1 = _mm_add_epi32(mVal1, mAddOffset); mVal2 = _mm_add_epi32(mVal2, mAddOffset); mVal1 = _mm_srai_epi32(mVal1, shift); mVal2 = _mm_srai_epi32(mVal2, shift); mVal = _mm_packs_epi32(mVal1, mVal2); mVal = _mm_packus_epi16(mVal, mVal); _mm_storel_epi64((__m128i*)(&dst1[col] + i_dst), mVal); if (bsymySecond) { T0 = _mm_unpacklo_epi16(T20, T90); T1 = _mm_unpacklo_epi16(T30, T80); T2 = _mm_unpacklo_epi16(T40, T70); T3 = _mm_unpacklo_epi16(T50, T60); T4 = _mm_unpackhi_epi16(T20, T90); T5 = _mm_unpackhi_epi16(T30, T80); T6 = _mm_unpackhi_epi16(T40, T70); T7 = _mm_unpackhi_epi16(T50, T60); } else { T0 = _mm_unpacklo_epi16(T20, T30); T1 = _mm_unpacklo_epi16(T40, T50); T2 = _mm_unpacklo_epi16(T60, T70); T3 = _mm_unpacklo_epi16(T80, T90); T4 = _mm_unpackhi_epi16(T20, T30); T5 = _mm_unpackhi_epi16(T40, T50); T6 = _mm_unpackhi_epi16(T60, T70); T7 = _mm_unpackhi_epi16(T80, T90); } T0 = _mm_madd_epi16(T0, mCoefy1Second); T1 = _mm_madd_epi16(T1, mCoefy2Second); T2 = _mm_madd_epi16(T2, mCoefy3Second); T3 = _mm_madd_epi16(T3, mCoefy4Second); T4 = _mm_madd_epi16(T4, mCoefy1Second); T5 = _mm_madd_epi16(T5, mCoefy2Second); T6 = _mm_madd_epi16(T6, mCoefy3Second); T7 = _mm_madd_epi16(T7, mCoefy4Second); mVal1 = _mm_add_epi32(T0, T1); mVal1 = _mm_add_epi32(mVal1, T2); mVal1 = _mm_add_epi32(mVal1, T3); mVal2 = _mm_add_epi32(T4, T5); mVal2 = _mm_add_epi32(mVal2, T6); 
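/* mVal1 accumulates the four low 32-bit lanes (output columns col..col+3)
 * and mVal2 the four high lanes (col+4..col+7); _mm_packs_epi32 below
 * merges them back into eight 16-bit results before the final byte pack. */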
mVal2 = _mm_add_epi32(mVal2, T7); mVal1 = _mm_add_epi32(mVal1, mAddOffset); mVal2 = _mm_add_epi32(mVal2, mAddOffset); mVal1 = _mm_srai_epi32(mVal1, shift); mVal2 = _mm_srai_epi32(mVal2, shift); mVal = _mm_packs_epi32(mVal1, mVal2); mVal = _mm_packus_epi16(mVal, mVal); _mm_storel_epi64((__m128i*)(&dst1[col] + 2 * i_dst), mVal); if (bsymySecond) { T0 = _mm_unpacklo_epi16(T30, Ta0); T1 = _mm_unpacklo_epi16(T40, T90); T2 = _mm_unpacklo_epi16(T50, T80); T3 = _mm_unpacklo_epi16(T60, T70); T4 = _mm_unpackhi_epi16(T30, Ta0); T5 = _mm_unpackhi_epi16(T40, T90); T6 = _mm_unpackhi_epi16(T50, T80); T7 = _mm_unpackhi_epi16(T60, T70); } else { T0 = _mm_unpacklo_epi16(T30, T40); T1 = _mm_unpacklo_epi16(T50, T60); T2 = _mm_unpacklo_epi16(T70, T80); T3 = _mm_unpacklo_epi16(T90, Ta0); T4 = _mm_unpackhi_epi16(T30, T40); T5 = _mm_unpackhi_epi16(T50, T60); T6 = _mm_unpackhi_epi16(T70, T80); T7 = _mm_unpackhi_epi16(T90, Ta0); } T0 = _mm_madd_epi16(T0, mCoefy1Second); T1 = _mm_madd_epi16(T1, mCoefy2Second); T2 = _mm_madd_epi16(T2, mCoefy3Second); T3 = _mm_madd_epi16(T3, mCoefy4Second); T4 = _mm_madd_epi16(T4, mCoefy1Second); T5 = _mm_madd_epi16(T5, mCoefy2Second); T6 = _mm_madd_epi16(T6, mCoefy3Second); T7 = _mm_madd_epi16(T7, mCoefy4Second); mVal1 = _mm_add_epi32(T0, T1); mVal1 = _mm_add_epi32(mVal1, T2); mVal1 = _mm_add_epi32(mVal1, T3); mVal2 = _mm_add_epi32(T4, T5); mVal2 = _mm_add_epi32(mVal2, T6); mVal2 = _mm_add_epi32(mVal2, T7); mVal1 = _mm_add_epi32(mVal1, mAddOffset); mVal2 = _mm_add_epi32(mVal2, mAddOffset); mVal1 = _mm_srai_epi32(mVal1, shift); mVal2 = _mm_srai_epi32(mVal2, shift); mVal = _mm_packs_epi32(mVal1, mVal2); mVal = _mm_packus_epi16(mVal, mVal); _mm_storel_epi64((__m128i*)(&dst1[col] + 3 * i_dst), mVal); //Third if (bsymyThird) { T0 = _mm_unpacklo_epi16(T00, T70); T1 = _mm_unpacklo_epi16(T10, T60); T2 = _mm_unpacklo_epi16(T20, T50); T3 = _mm_unpacklo_epi16(T30, T40); T4 = _mm_unpackhi_epi16(T00, T70); T5 = _mm_unpackhi_epi16(T10, T60); T6 = _mm_unpackhi_epi16(T20, T50); T7 = _mm_unpackhi_epi16(T30, T40); } else { T0 = _mm_unpacklo_epi16(T00, T10); T1 = _mm_unpacklo_epi16(T20, T30); T2 = _mm_unpacklo_epi16(T40, T50); T3 = _mm_unpacklo_epi16(T60, T70); T4 = _mm_unpackhi_epi16(T00, T10); T5 = _mm_unpackhi_epi16(T20, T30); T6 = _mm_unpackhi_epi16(T40, T50); T7 = _mm_unpackhi_epi16(T60, T70); } T0 = _mm_madd_epi16(T0, mCoefy1Third); T1 = _mm_madd_epi16(T1, mCoefy2Third); T2 = _mm_madd_epi16(T2, mCoefy3Third); T3 = _mm_madd_epi16(T3, mCoefy4Third); T4 = _mm_madd_epi16(T4, mCoefy1Third); T5 = _mm_madd_epi16(T5, mCoefy2Third); T6 = _mm_madd_epi16(T6, mCoefy3Third); T7 = _mm_madd_epi16(T7, mCoefy4Third); mVal1 = _mm_add_epi32(T0, T1); mVal1 = _mm_add_epi32(mVal1, T2); mVal1 = _mm_add_epi32(mVal1, T3); mVal2 = _mm_add_epi32(T4, T5); mVal2 = _mm_add_epi32(mVal2, T6); mVal2 = _mm_add_epi32(mVal2, T7); mVal1 = _mm_add_epi32(mVal1, mAddOffset); mVal2 = _mm_add_epi32(mVal2, mAddOffset); mVal1 = _mm_srai_epi32(mVal1, shift); mVal2 = _mm_srai_epi32(mVal2, shift); mVal = _mm_packs_epi32(mVal1, mVal2); mVal = _mm_packus_epi16(mVal, mVal); _mm_storel_epi64((__m128i*)&dst2[col], mVal); if (bsymyThird) { T0 = _mm_unpacklo_epi16(T10, T80); T1 = _mm_unpacklo_epi16(T20, T70); T2 = _mm_unpacklo_epi16(T30, T60); T3 = _mm_unpacklo_epi16(T40, T50); T4 = _mm_unpackhi_epi16(T10, T80); T5 = _mm_unpackhi_epi16(T20, T70); T6 = _mm_unpackhi_epi16(T30, T60); T7 = _mm_unpackhi_epi16(T40, T50); } else { T0 = _mm_unpacklo_epi16(T10, T20); T1 = _mm_unpacklo_epi16(T30, T40); T2 = _mm_unpacklo_epi16(T50, T60); T3 = 
_mm_unpacklo_epi16(T70, T80); T4 = _mm_unpackhi_epi16(T10, T20); T5 = _mm_unpackhi_epi16(T30, T40); T6 = _mm_unpackhi_epi16(T50, T60); T7 = _mm_unpackhi_epi16(T70, T80); } T0 = _mm_madd_epi16(T0, mCoefy1Third); T1 = _mm_madd_epi16(T1, mCoefy2Third); T2 = _mm_madd_epi16(T2, mCoefy3Third); T3 = _mm_madd_epi16(T3, mCoefy4Third); T4 = _mm_madd_epi16(T4, mCoefy1Third); T5 = _mm_madd_epi16(T5, mCoefy2Third); T6 = _mm_madd_epi16(T6, mCoefy3Third); T7 = _mm_madd_epi16(T7, mCoefy4Third); mVal1 = _mm_add_epi32(T0, T1); mVal1 = _mm_add_epi32(mVal1, T2); mVal1 = _mm_add_epi32(mVal1, T3); mVal2 = _mm_add_epi32(T4, T5); mVal2 = _mm_add_epi32(mVal2, T6); mVal2 = _mm_add_epi32(mVal2, T7); mVal1 = _mm_add_epi32(mVal1, mAddOffset); mVal2 = _mm_add_epi32(mVal2, mAddOffset); mVal1 = _mm_srai_epi32(mVal1, shift); mVal2 = _mm_srai_epi32(mVal2, shift); mVal = _mm_packs_epi32(mVal1, mVal2); mVal = _mm_packus_epi16(mVal, mVal); _mm_storel_epi64((__m128i*)(&dst2[col] + i_dst), mVal); if (bsymyThird) { T0 = _mm_unpacklo_epi16(T20, T90); T1 = _mm_unpacklo_epi16(T30, T80); T2 = _mm_unpacklo_epi16(T40, T70); T3 = _mm_unpacklo_epi16(T50, T60); T4 = _mm_unpackhi_epi16(T20, T90); T5 = _mm_unpackhi_epi16(T30, T80); T6 = _mm_unpackhi_epi16(T40, T70); T7 = _mm_unpackhi_epi16(T50, T60); } else { T0 = _mm_unpacklo_epi16(T20, T30); T1 = _mm_unpacklo_epi16(T40, T50); T2 = _mm_unpacklo_epi16(T60, T70); T3 = _mm_unpacklo_epi16(T80, T90); T4 = _mm_unpackhi_epi16(T20, T30); T5 = _mm_unpackhi_epi16(T40, T50); T6 = _mm_unpackhi_epi16(T60, T70); T7 = _mm_unpackhi_epi16(T80, T90); } T0 = _mm_madd_epi16(T0, mCoefy1Third); T1 = _mm_madd_epi16(T1, mCoefy2Third); T2 = _mm_madd_epi16(T2, mCoefy3Third); T3 = _mm_madd_epi16(T3, mCoefy4Third); T4 = _mm_madd_epi16(T4, mCoefy1Third); T5 = _mm_madd_epi16(T5, mCoefy2Third); T6 = _mm_madd_epi16(T6, mCoefy3Third); T7 = _mm_madd_epi16(T7, mCoefy4Third); mVal1 = _mm_add_epi32(T0, T1); mVal1 = _mm_add_epi32(mVal1, T2); mVal1 = _mm_add_epi32(mVal1, T3); mVal2 = _mm_add_epi32(T4, T5); mVal2 = _mm_add_epi32(mVal2, T6); mVal2 = _mm_add_epi32(mVal2, T7); mVal1 = _mm_add_epi32(mVal1, mAddOffset); mVal2 = _mm_add_epi32(mVal2, mAddOffset); mVal1 = _mm_srai_epi32(mVal1, shift); mVal2 = _mm_srai_epi32(mVal2, shift); mVal = _mm_packs_epi32(mVal1, mVal2); mVal = _mm_packus_epi16(mVal, mVal); _mm_storel_epi64((__m128i*)(&dst2[col] + 2 * i_dst), mVal); if (bsymyThird) { T0 = _mm_unpacklo_epi16(T30, Ta0); T1 = _mm_unpacklo_epi16(T40, T90); T2 = _mm_unpacklo_epi16(T50, T80); T3 = _mm_unpacklo_epi16(T60, T70); T4 = _mm_unpackhi_epi16(T30, Ta0); T5 = _mm_unpackhi_epi16(T40, T90); T6 = _mm_unpackhi_epi16(T50, T80); T7 = _mm_unpackhi_epi16(T60, T70); } else { T0 = _mm_unpacklo_epi16(T30, T40); T1 = _mm_unpacklo_epi16(T50, T60); T2 = _mm_unpacklo_epi16(T70, T80); T3 = _mm_unpacklo_epi16(T90, Ta0); T4 = _mm_unpackhi_epi16(T30, T40); T5 = _mm_unpackhi_epi16(T50, T60); T6 = _mm_unpackhi_epi16(T70, T80); T7 = _mm_unpackhi_epi16(T90, Ta0); } T0 = _mm_madd_epi16(T0, mCoefy1Third); T1 = _mm_madd_epi16(T1, mCoefy2Third); T2 = _mm_madd_epi16(T2, mCoefy3Third); T3 = _mm_madd_epi16(T3, mCoefy4Third); T4 = _mm_madd_epi16(T4, mCoefy1Third); T5 = _mm_madd_epi16(T5, mCoefy2Third); T6 = _mm_madd_epi16(T6, mCoefy3Third); T7 = _mm_madd_epi16(T7, mCoefy4Third); mVal1 = _mm_add_epi32(T0, T1); mVal1 = _mm_add_epi32(mVal1, T2); mVal1 = _mm_add_epi32(mVal1, T3); mVal2 = _mm_add_epi32(T4, T5); mVal2 = _mm_add_epi32(mVal2, T6); mVal2 = _mm_add_epi32(mVal2, T7); mVal1 = _mm_add_epi32(mVal1, mAddOffset); mVal2 = _mm_add_epi32(mVal2, mAddOffset); 
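/* Rounding/narrowing step, sketched in scalar form (illustrative only):
 * the horizontal and vertical passes each carry the 64-weight filter gain,
 * so the 32-bit sums are scaled by 2^12; mAddOffset = 1 << (shift - 1)
 * makes the shift round to nearest, and packs/packus clip the result to
 * the 8-bit pixel range (clip_to_u8 is a hypothetical helper):
 *
 *     dst_pixel = clip_to_u8((sum + (1 << 11)) >> 12);
 */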
mVal1 = _mm_srai_epi32(mVal1, shift); mVal2 = _mm_srai_epi32(mVal2, shift); mVal = _mm_packs_epi32(mVal1, mVal2); mVal = _mm_packus_epi16(mVal, mVal); _mm_storel_epi64((__m128i*)(&dst2[col] + 3 * i_dst), mVal); p += 8; }
if (col < width) { __m128i T00 = _mm_loadu_si128((__m128i*)(p)); __m128i T10 = _mm_loadu_si128((__m128i*)(p + i_tmp)); __m128i T20 = _mm_loadu_si128((__m128i*)(p + 2 * i_tmp)); __m128i T30 = _mm_loadu_si128((__m128i*)(p + 3 * i_tmp)); __m128i T40 = _mm_loadu_si128((__m128i*)(p + 4 * i_tmp)); __m128i T50 = _mm_loadu_si128((__m128i*)(p + 5 * i_tmp)); __m128i T60 = _mm_loadu_si128((__m128i*)(p + 6 * i_tmp)); __m128i T70 = _mm_loadu_si128((__m128i*)(p + 7 * i_tmp));
/* rows 8..10 are read by the 2nd..4th output rows below and must be loaded from the current p as well */
__m128i T80 = _mm_loadu_si128((__m128i*)(p + 8 * i_tmp)); __m128i T90 = _mm_loadu_si128((__m128i*)(p + 9 * i_tmp)); __m128i Ta0 = _mm_loadu_si128((__m128i*)(p + 10 * i_tmp));
//First
if (bsymyFirst) { T0 = _mm_unpacklo_epi16(T00, T70); T1 = _mm_unpacklo_epi16(T10, T60); T2 = _mm_unpacklo_epi16(T20, T50); T3 = _mm_unpacklo_epi16(T30, T40); T4 = _mm_unpackhi_epi16(T00, T70); T5 = _mm_unpackhi_epi16(T10, T60); T6 = _mm_unpackhi_epi16(T20, T50); T7 = _mm_unpackhi_epi16(T30, T40); } else { T0 = _mm_unpacklo_epi16(T00, T10); T1 = _mm_unpacklo_epi16(T20, T30); T2 = _mm_unpacklo_epi16(T40, T50); T3 = _mm_unpacklo_epi16(T60, T70); T4 = _mm_unpackhi_epi16(T00, T10); T5 = _mm_unpackhi_epi16(T20, T30); T6 = _mm_unpackhi_epi16(T40, T50); T7 = _mm_unpackhi_epi16(T60, T70); }
T0 = _mm_madd_epi16(T0, mCoefy1First); T1 = _mm_madd_epi16(T1, mCoefy2First); T2 = _mm_madd_epi16(T2, mCoefy3First); T3 = _mm_madd_epi16(T3, mCoefy4First); T4 = _mm_madd_epi16(T4, mCoefy1First); T5 = _mm_madd_epi16(T5, mCoefy2First); T6 = _mm_madd_epi16(T6, mCoefy3First); T7 = _mm_madd_epi16(T7, mCoefy4First); mVal1 = _mm_add_epi32(T0, T1); mVal1 = _mm_add_epi32(mVal1, T2); mVal1 = _mm_add_epi32(mVal1, T3); mVal2 = _mm_add_epi32(T4, T5); mVal2 = _mm_add_epi32(mVal2, T6); mVal2 = _mm_add_epi32(mVal2, T7); mVal1 = _mm_add_epi32(mVal1, mAddOffset); mVal2 = _mm_add_epi32(mVal2, mAddOffset); mVal1 = _mm_srai_epi32(mVal1, shift); mVal2 = _mm_srai_epi32(mVal2, shift); mVal = _mm_packs_epi32(mVal1, mVal2); mVal = _mm_packus_epi16(mVal, mVal); _mm_maskmoveu_si128(mVal, mask, (char *)&dst0[col]);
if (bsymyFirst) { T0 = _mm_unpacklo_epi16(T10, T80); T1 = _mm_unpacklo_epi16(T20, T70); T2 = _mm_unpacklo_epi16(T30, T60); T3 = _mm_unpacklo_epi16(T40, T50); T4 = _mm_unpackhi_epi16(T10, T80); T5 = _mm_unpackhi_epi16(T20, T70); T6 = _mm_unpackhi_epi16(T30, T60); T7 = _mm_unpackhi_epi16(T40, T50); } else { T0 = _mm_unpacklo_epi16(T10, T20); T1 = _mm_unpacklo_epi16(T30, T40); T2 = _mm_unpacklo_epi16(T50, T60); T3 = _mm_unpacklo_epi16(T70, T80); T4 = _mm_unpackhi_epi16(T10, T20); T5 = _mm_unpackhi_epi16(T30, T40); T6 = _mm_unpackhi_epi16(T50, T60); T7 = _mm_unpackhi_epi16(T70, T80); }
T0 = _mm_madd_epi16(T0, mCoefy1First); T1 = _mm_madd_epi16(T1, mCoefy2First); T2 = _mm_madd_epi16(T2, mCoefy3First); T3 = _mm_madd_epi16(T3, mCoefy4First); T4 = _mm_madd_epi16(T4, mCoefy1First); T5 = _mm_madd_epi16(T5, mCoefy2First); T6 = _mm_madd_epi16(T6, mCoefy3First); T7 = _mm_madd_epi16(T7, mCoefy4First); mVal1 = _mm_add_epi32(T0, T1); mVal1 = _mm_add_epi32(mVal1, T2); mVal1 = _mm_add_epi32(mVal1, T3); mVal2 = _mm_add_epi32(T4, T5); mVal2 = _mm_add_epi32(mVal2, T6); mVal2 = _mm_add_epi32(mVal2, T7); mVal1 = _mm_add_epi32(mVal1, mAddOffset); mVal2 = _mm_add_epi32(mVal2, mAddOffset); mVal1 = _mm_srai_epi32(mVal1, shift); mVal2 = _mm_srai_epi32(mVal2, shift); mVal = _mm_packs_epi32(mVal1, mVal2); mVal = _mm_packus_epi16(mVal, mVal); _mm_maskmoveu_si128(mVal, mask, (char *)(&dst0[col] + i_dst));
if (bsymyFirst) { T0 = _mm_unpacklo_epi16(T20, T90); T1 = _mm_unpacklo_epi16(T30, T80); T2 = _mm_unpacklo_epi16(T40, T70); T3 = _mm_unpacklo_epi16(T50, T60); T4 = _mm_unpackhi_epi16(T20, T90); T5 = _mm_unpackhi_epi16(T30, T80); T6 = _mm_unpackhi_epi16(T40, T70); T7 = _mm_unpackhi_epi16(T50, T60); } else { T0 = _mm_unpacklo_epi16(T20, T30); T1 = _mm_unpacklo_epi16(T40, T50); T2 = _mm_unpacklo_epi16(T60, T70); T3 = _mm_unpacklo_epi16(T80, T90); T4 = _mm_unpackhi_epi16(T20, T30); T5 = _mm_unpackhi_epi16(T40, T50); T6 = _mm_unpackhi_epi16(T60, T70); T7 = _mm_unpackhi_epi16(T80, T90); }
T0 = _mm_madd_epi16(T0, mCoefy1First); T1 = _mm_madd_epi16(T1, mCoefy2First); T2 = _mm_madd_epi16(T2, mCoefy3First); T3 = _mm_madd_epi16(T3, mCoefy4First); T4 = _mm_madd_epi16(T4, mCoefy1First); T5 = _mm_madd_epi16(T5, mCoefy2First); T6 = _mm_madd_epi16(T6, mCoefy3First); T7 = _mm_madd_epi16(T7, mCoefy4First); mVal1 = _mm_add_epi32(T0, T1); mVal1 = _mm_add_epi32(mVal1, T2); mVal1 = _mm_add_epi32(mVal1, T3); mVal2 = _mm_add_epi32(T4, T5); mVal2 = _mm_add_epi32(mVal2, T6); mVal2 = _mm_add_epi32(mVal2, T7); mVal1 = _mm_add_epi32(mVal1, mAddOffset); mVal2 = _mm_add_epi32(mVal2, mAddOffset); mVal1 = _mm_srai_epi32(mVal1, shift); mVal2 = _mm_srai_epi32(mVal2, shift); mVal = _mm_packs_epi32(mVal1, mVal2); mVal = _mm_packus_epi16(mVal, mVal); _mm_maskmoveu_si128(mVal, mask, (char *)(&dst0[col] + 2 * i_dst));
if (bsymyFirst) { T0 = _mm_unpacklo_epi16(T30, Ta0); T1 = _mm_unpacklo_epi16(T40, T90); T2 = _mm_unpacklo_epi16(T50, T80); T3 = _mm_unpacklo_epi16(T60, T70); T4 = _mm_unpackhi_epi16(T30, Ta0); T5 = _mm_unpackhi_epi16(T40, T90); T6 = _mm_unpackhi_epi16(T50, T80); T7 = _mm_unpackhi_epi16(T60, T70); } else { T0 = _mm_unpacklo_epi16(T30, T40); T1 = _mm_unpacklo_epi16(T50, T60); T2 = _mm_unpacklo_epi16(T70, T80); T3 = _mm_unpacklo_epi16(T90, Ta0); T4 = _mm_unpackhi_epi16(T30, T40); T5 = _mm_unpackhi_epi16(T50, T60); T6 = _mm_unpackhi_epi16(T70, T80); T7 = _mm_unpackhi_epi16(T90, Ta0); }
T0 = _mm_madd_epi16(T0, mCoefy1First); T1 = _mm_madd_epi16(T1, mCoefy2First); T2 = _mm_madd_epi16(T2, mCoefy3First); T3 = _mm_madd_epi16(T3, mCoefy4First); T4 = _mm_madd_epi16(T4, mCoefy1First); T5 = _mm_madd_epi16(T5, mCoefy2First); T6 = _mm_madd_epi16(T6, mCoefy3First); T7 = _mm_madd_epi16(T7, mCoefy4First); mVal1 = _mm_add_epi32(T0, T1); mVal1 = _mm_add_epi32(mVal1, T2); mVal1 = _mm_add_epi32(mVal1, T3); mVal2 = _mm_add_epi32(T4, T5); mVal2 = _mm_add_epi32(mVal2, T6); mVal2 = _mm_add_epi32(mVal2, T7); mVal1 = _mm_add_epi32(mVal1, mAddOffset); mVal2 = _mm_add_epi32(mVal2, mAddOffset); mVal1 = _mm_srai_epi32(mVal1, shift); mVal2 = _mm_srai_epi32(mVal2, shift); mVal = _mm_packs_epi32(mVal1, mVal2); mVal = _mm_packus_epi16(mVal, mVal); _mm_maskmoveu_si128(mVal, mask, (char *)(&dst0[col] + 3 * i_dst));
//Second
if (bsymySecond) { T0 = _mm_unpacklo_epi16(T00, T70); T1 = _mm_unpacklo_epi16(T10, T60); T2 = _mm_unpacklo_epi16(T20, T50); T3 = _mm_unpacklo_epi16(T30, T40); T4 = _mm_unpackhi_epi16(T00, T70); T5 = _mm_unpackhi_epi16(T10, T60); T6 = _mm_unpackhi_epi16(T20, T50); T7 = _mm_unpackhi_epi16(T30, T40); } else { T0 = _mm_unpacklo_epi16(T00, T10); T1 = _mm_unpacklo_epi16(T20, T30); T2 = _mm_unpacklo_epi16(T40, T50); T3 = _mm_unpacklo_epi16(T60, T70); T4 = _mm_unpackhi_epi16(T00, T10); T5 = _mm_unpackhi_epi16(T20, T30); T6 = _mm_unpackhi_epi16(T40, T50); T7 = _mm_unpackhi_epi16(T60, T70); }
T0 = _mm_madd_epi16(T0, mCoefy1Second); T1 = _mm_madd_epi16(T1, mCoefy2Second); T2 = _mm_madd_epi16(T2, mCoefy3Second); T3 = _mm_madd_epi16(T3, mCoefy4Second); T4 = _mm_madd_epi16(T4, mCoefy1Second); T5 =
_mm_madd_epi16(T5, mCoefy2Second); T6 = _mm_madd_epi16(T6, mCoefy3Second); T7 = _mm_madd_epi16(T7, mCoefy4Second); mVal1 = _mm_add_epi32(T0, T1); mVal1 = _mm_add_epi32(mVal1, T2); mVal1 = _mm_add_epi32(mVal1, T3); mVal2 = _mm_add_epi32(T4, T5); mVal2 = _mm_add_epi32(mVal2, T6); mVal2 = _mm_add_epi32(mVal2, T7); mVal1 = _mm_add_epi32(mVal1, mAddOffset); mVal2 = _mm_add_epi32(mVal2, mAddOffset); mVal1 = _mm_srai_epi32(mVal1, shift); mVal2 = _mm_srai_epi32(mVal2, shift); mVal = _mm_packs_epi32(mVal1, mVal2); mVal = _mm_packus_epi16(mVal, mVal); _mm_maskmoveu_si128(mVal, mask, (char *)&dst1[col]); if (bsymySecond) { T0 = _mm_unpacklo_epi16(T10, T80); T1 = _mm_unpacklo_epi16(T20, T70); T2 = _mm_unpacklo_epi16(T30, T60); T3 = _mm_unpacklo_epi16(T40, T50); T4 = _mm_unpackhi_epi16(T10, T80); T5 = _mm_unpackhi_epi16(T20, T70); T6 = _mm_unpackhi_epi16(T30, T60); T7 = _mm_unpackhi_epi16(T40, T50); } else { T0 = _mm_unpacklo_epi16(T10, T20); T1 = _mm_unpacklo_epi16(T30, T40); T2 = _mm_unpacklo_epi16(T50, T60); T3 = _mm_unpacklo_epi16(T70, T80); T4 = _mm_unpackhi_epi16(T10, T20); T5 = _mm_unpackhi_epi16(T30, T40); T6 = _mm_unpackhi_epi16(T50, T60); T7 = _mm_unpackhi_epi16(T70, T80); } T0 = _mm_madd_epi16(T0, mCoefy1Second); T1 = _mm_madd_epi16(T1, mCoefy2Second); T2 = _mm_madd_epi16(T2, mCoefy3Second); T3 = _mm_madd_epi16(T3, mCoefy4Second); T4 = _mm_madd_epi16(T4, mCoefy1Second); T5 = _mm_madd_epi16(T5, mCoefy2Second); T6 = _mm_madd_epi16(T6, mCoefy3Second); T7 = _mm_madd_epi16(T7, mCoefy4Second); mVal1 = _mm_add_epi32(T0, T1); mVal1 = _mm_add_epi32(mVal1, T2); mVal1 = _mm_add_epi32(mVal1, T3); mVal2 = _mm_add_epi32(T4, T5); mVal2 = _mm_add_epi32(mVal2, T6); mVal2 = _mm_add_epi32(mVal2, T7); mVal1 = _mm_add_epi32(mVal1, mAddOffset); mVal2 = _mm_add_epi32(mVal2, mAddOffset); mVal1 = _mm_srai_epi32(mVal1, shift); mVal2 = _mm_srai_epi32(mVal2, shift); mVal = _mm_packs_epi32(mVal1, mVal2); mVal = _mm_packus_epi16(mVal, mVal); _mm_maskmoveu_si128(mVal, mask, (char *)(&dst1[col] + i_dst)); if (bsymySecond) { T0 = _mm_unpacklo_epi16(T20, T90); T1 = _mm_unpacklo_epi16(T30, T80); T2 = _mm_unpacklo_epi16(T40, T70); T3 = _mm_unpacklo_epi16(T50, T60); T4 = _mm_unpackhi_epi16(T20, T90); T5 = _mm_unpackhi_epi16(T30, T80); T6 = _mm_unpackhi_epi16(T40, T70); T7 = _mm_unpackhi_epi16(T50, T60); } else { T0 = _mm_unpacklo_epi16(T20, T30); T1 = _mm_unpacklo_epi16(T40, T50); T2 = _mm_unpacklo_epi16(T60, T70); T3 = _mm_unpacklo_epi16(T80, T90); T4 = _mm_unpackhi_epi16(T20, T30); T5 = _mm_unpackhi_epi16(T40, T50); T6 = _mm_unpackhi_epi16(T60, T70); T7 = _mm_unpackhi_epi16(T80, T90); } T0 = _mm_madd_epi16(T0, mCoefy1Second); T1 = _mm_madd_epi16(T1, mCoefy2Second); T2 = _mm_madd_epi16(T2, mCoefy3Second); T3 = _mm_madd_epi16(T3, mCoefy4Second); T4 = _mm_madd_epi16(T4, mCoefy1Second); T5 = _mm_madd_epi16(T5, mCoefy2Second); T6 = _mm_madd_epi16(T6, mCoefy3Second); T7 = _mm_madd_epi16(T7, mCoefy4Second); mVal1 = _mm_add_epi32(T0, T1); mVal1 = _mm_add_epi32(mVal1, T2); mVal1 = _mm_add_epi32(mVal1, T3); mVal2 = _mm_add_epi32(T4, T5); mVal2 = _mm_add_epi32(mVal2, T6); mVal2 = _mm_add_epi32(mVal2, T7); mVal1 = _mm_add_epi32(mVal1, mAddOffset); mVal2 = _mm_add_epi32(mVal2, mAddOffset); mVal1 = _mm_srai_epi32(mVal1, shift); mVal2 = _mm_srai_epi32(mVal2, shift); mVal = _mm_packs_epi32(mVal1, mVal2); mVal = _mm_packus_epi16(mVal, mVal); _mm_maskmoveu_si128(mVal, mask, (char *)(&dst1[col] + 2 * i_dst)); if (bsymySecond) { T0 = _mm_unpacklo_epi16(T30, Ta0); T1 = _mm_unpacklo_epi16(T40, T90); T2 = _mm_unpacklo_epi16(T50, T80); T3 = 
_mm_unpacklo_epi16(T60, T70); T4 = _mm_unpackhi_epi16(T30, Ta0); T5 = _mm_unpackhi_epi16(T40, T90); T6 = _mm_unpackhi_epi16(T50, T80); T7 = _mm_unpackhi_epi16(T60, T70); } else { T0 = _mm_unpacklo_epi16(T30, T40); T1 = _mm_unpacklo_epi16(T50, T60); T2 = _mm_unpacklo_epi16(T70, T80); T3 = _mm_unpacklo_epi16(T90, Ta0); T4 = _mm_unpackhi_epi16(T30, T40); T5 = _mm_unpackhi_epi16(T50, T60); T6 = _mm_unpackhi_epi16(T70, T80); T7 = _mm_unpackhi_epi16(T90, Ta0); } T0 = _mm_madd_epi16(T0, mCoefy1Second); T1 = _mm_madd_epi16(T1, mCoefy2Second); T2 = _mm_madd_epi16(T2, mCoefy3Second); T3 = _mm_madd_epi16(T3, mCoefy4Second); T4 = _mm_madd_epi16(T4, mCoefy1Second); T5 = _mm_madd_epi16(T5, mCoefy2Second); T6 = _mm_madd_epi16(T6, mCoefy3Second); T7 = _mm_madd_epi16(T7, mCoefy4Second); mVal1 = _mm_add_epi32(T0, T1); mVal1 = _mm_add_epi32(mVal1, T2); mVal1 = _mm_add_epi32(mVal1, T3); mVal2 = _mm_add_epi32(T4, T5); mVal2 = _mm_add_epi32(mVal2, T6); mVal2 = _mm_add_epi32(mVal2, T7); mVal1 = _mm_add_epi32(mVal1, mAddOffset); mVal2 = _mm_add_epi32(mVal2, mAddOffset); mVal1 = _mm_srai_epi32(mVal1, shift); mVal2 = _mm_srai_epi32(mVal2, shift); mVal = _mm_packs_epi32(mVal1, mVal2); mVal = _mm_packus_epi16(mVal, mVal); _mm_maskmoveu_si128(mVal, mask, (char *)(&dst1[col] + 3 * i_dst)); //Third if (bsymyThird) { T0 = _mm_unpacklo_epi16(T00, T70); T1 = _mm_unpacklo_epi16(T10, T60); T2 = _mm_unpacklo_epi16(T20, T50); T3 = _mm_unpacklo_epi16(T30, T40); T4 = _mm_unpackhi_epi16(T00, T70); T5 = _mm_unpackhi_epi16(T10, T60); T6 = _mm_unpackhi_epi16(T20, T50); T7 = _mm_unpackhi_epi16(T30, T40); } else { T0 = _mm_unpacklo_epi16(T00, T10); T1 = _mm_unpacklo_epi16(T20, T30); T2 = _mm_unpacklo_epi16(T40, T50); T3 = _mm_unpacklo_epi16(T60, T70); T4 = _mm_unpackhi_epi16(T00, T10); T5 = _mm_unpackhi_epi16(T20, T30); T6 = _mm_unpackhi_epi16(T40, T50); T7 = _mm_unpackhi_epi16(T60, T70); } T0 = _mm_madd_epi16(T0, mCoefy1Third); T1 = _mm_madd_epi16(T1, mCoefy2Third); T2 = _mm_madd_epi16(T2, mCoefy3Third); T3 = _mm_madd_epi16(T3, mCoefy4Third); T4 = _mm_madd_epi16(T4, mCoefy1Third); T5 = _mm_madd_epi16(T5, mCoefy2Third); T6 = _mm_madd_epi16(T6, mCoefy3Third); T7 = _mm_madd_epi16(T7, mCoefy4Third); mVal1 = _mm_add_epi32(T0, T1); mVal1 = _mm_add_epi32(mVal1, T2); mVal1 = _mm_add_epi32(mVal1, T3); mVal2 = _mm_add_epi32(T4, T5); mVal2 = _mm_add_epi32(mVal2, T6); mVal2 = _mm_add_epi32(mVal2, T7); mVal1 = _mm_add_epi32(mVal1, mAddOffset); mVal2 = _mm_add_epi32(mVal2, mAddOffset); mVal1 = _mm_srai_epi32(mVal1, shift); mVal2 = _mm_srai_epi32(mVal2, shift); mVal = _mm_packs_epi32(mVal1, mVal2); mVal = _mm_packus_epi16(mVal, mVal); _mm_maskmoveu_si128(mVal, mask, (char *)&dst2[col]); if (bsymyThird) { T0 = _mm_unpacklo_epi16(T10, T80); T1 = _mm_unpacklo_epi16(T20, T70); T2 = _mm_unpacklo_epi16(T30, T60); T3 = _mm_unpacklo_epi16(T40, T50); T4 = _mm_unpackhi_epi16(T10, T80); T5 = _mm_unpackhi_epi16(T20, T70); T6 = _mm_unpackhi_epi16(T30, T60); T7 = _mm_unpackhi_epi16(T40, T50); } else { T0 = _mm_unpacklo_epi16(T10, T20); T1 = _mm_unpacklo_epi16(T30, T40); T2 = _mm_unpacklo_epi16(T50, T60); T3 = _mm_unpacklo_epi16(T70, T80); T4 = _mm_unpackhi_epi16(T10, T20); T5 = _mm_unpackhi_epi16(T30, T40); T6 = _mm_unpackhi_epi16(T50, T60); T7 = _mm_unpackhi_epi16(T70, T80); } T0 = _mm_madd_epi16(T0, mCoefy1Third); T1 = _mm_madd_epi16(T1, mCoefy2Third); T2 = _mm_madd_epi16(T2, mCoefy3Third); T3 = _mm_madd_epi16(T3, mCoefy4Third); T4 = _mm_madd_epi16(T4, mCoefy1Third); T5 = _mm_madd_epi16(T5, mCoefy2Third); T6 = _mm_madd_epi16(T6, mCoefy3Third); T7 = 
_mm_madd_epi16(T7, mCoefy4Third); mVal1 = _mm_add_epi32(T0, T1); mVal1 = _mm_add_epi32(mVal1, T2); mVal1 = _mm_add_epi32(mVal1, T3); mVal2 = _mm_add_epi32(T4, T5); mVal2 = _mm_add_epi32(mVal2, T6); mVal2 = _mm_add_epi32(mVal2, T7); mVal1 = _mm_add_epi32(mVal1, mAddOffset); mVal2 = _mm_add_epi32(mVal2, mAddOffset); mVal1 = _mm_srai_epi32(mVal1, shift); mVal2 = _mm_srai_epi32(mVal2, shift); mVal = _mm_packs_epi32(mVal1, mVal2); mVal = _mm_packus_epi16(mVal, mVal); _mm_maskmoveu_si128(mVal, mask, (char *)(&dst2[col] + i_dst)); if (bsymyThird) { T0 = _mm_unpacklo_epi16(T20, T90); T1 = _mm_unpacklo_epi16(T30, T80); T2 = _mm_unpacklo_epi16(T40, T70); T3 = _mm_unpacklo_epi16(T50, T60); T4 = _mm_unpackhi_epi16(T20, T90); T5 = _mm_unpackhi_epi16(T30, T80); T6 = _mm_unpackhi_epi16(T40, T70); T7 = _mm_unpackhi_epi16(T50, T60); } else { T0 = _mm_unpacklo_epi16(T20, T30); T1 = _mm_unpacklo_epi16(T40, T50); T2 = _mm_unpacklo_epi16(T60, T70); T3 = _mm_unpacklo_epi16(T80, T90); T4 = _mm_unpackhi_epi16(T20, T30); T5 = _mm_unpackhi_epi16(T40, T50); T6 = _mm_unpackhi_epi16(T60, T70); T7 = _mm_unpackhi_epi16(T80, T90); } T0 = _mm_madd_epi16(T0, mCoefy1Third); T1 = _mm_madd_epi16(T1, mCoefy2Third); T2 = _mm_madd_epi16(T2, mCoefy3Third); T3 = _mm_madd_epi16(T3, mCoefy4Third); T4 = _mm_madd_epi16(T4, mCoefy1Third); T5 = _mm_madd_epi16(T5, mCoefy2Third); T6 = _mm_madd_epi16(T6, mCoefy3Third); T7 = _mm_madd_epi16(T7, mCoefy4Third); mVal1 = _mm_add_epi32(T0, T1); mVal1 = _mm_add_epi32(mVal1, T2); mVal1 = _mm_add_epi32(mVal1, T3); mVal2 = _mm_add_epi32(T4, T5); mVal2 = _mm_add_epi32(mVal2, T6); mVal2 = _mm_add_epi32(mVal2, T7); mVal1 = _mm_add_epi32(mVal1, mAddOffset); mVal2 = _mm_add_epi32(mVal2, mAddOffset); mVal1 = _mm_srai_epi32(mVal1, shift); mVal2 = _mm_srai_epi32(mVal2, shift); mVal = _mm_packs_epi32(mVal1, mVal2); mVal = _mm_packus_epi16(mVal, mVal); _mm_maskmoveu_si128(mVal, mask, (char *)(&dst2[col] + 2 * i_dst)); if (bsymyThird) { T0 = _mm_unpacklo_epi16(T30, Ta0); T1 = _mm_unpacklo_epi16(T40, T90); T2 = _mm_unpacklo_epi16(T50, T80); T3 = _mm_unpacklo_epi16(T60, T70); T4 = _mm_unpackhi_epi16(T30, Ta0); T5 = _mm_unpackhi_epi16(T40, T90); T6 = _mm_unpackhi_epi16(T50, T80); T7 = _mm_unpackhi_epi16(T60, T70); } else { T0 = _mm_unpacklo_epi16(T30, T40); T1 = _mm_unpacklo_epi16(T50, T60); T2 = _mm_unpacklo_epi16(T70, T80); T3 = _mm_unpacklo_epi16(T90, Ta0); T4 = _mm_unpackhi_epi16(T30, T40); T5 = _mm_unpackhi_epi16(T50, T60); T6 = _mm_unpackhi_epi16(T70, T80); T7 = _mm_unpackhi_epi16(T90, Ta0); } T0 = _mm_madd_epi16(T0, mCoefy1Third); T1 = _mm_madd_epi16(T1, mCoefy2Third); T2 = _mm_madd_epi16(T2, mCoefy3Third); T3 = _mm_madd_epi16(T3, mCoefy4Third); T4 = _mm_madd_epi16(T4, mCoefy1Third); T5 = _mm_madd_epi16(T5, mCoefy2Third); T6 = _mm_madd_epi16(T6, mCoefy3Third); T7 = _mm_madd_epi16(T7, mCoefy4Third); mVal1 = _mm_add_epi32(T0, T1); mVal1 = _mm_add_epi32(mVal1, T2); mVal1 = _mm_add_epi32(mVal1, T3); mVal2 = _mm_add_epi32(T4, T5); mVal2 = _mm_add_epi32(mVal2, T6); mVal2 = _mm_add_epi32(mVal2, T7); mVal1 = _mm_add_epi32(mVal1, mAddOffset); mVal2 = _mm_add_epi32(mVal2, mAddOffset); mVal1 = _mm_srai_epi32(mVal1, shift); mVal2 = _mm_srai_epi32(mVal2, shift); mVal = _mm_packs_epi32(mVal1, mVal2); mVal = _mm_packus_epi16(mVal, mVal); _mm_maskmoveu_si128(mVal, mask, (char *)(&dst2[col] + 3 * i_dst)); } tmp += 4 * i_tmp; dst0 += 4 * i_dst; dst1 += 4 * i_dst; dst2 += 4 * i_dst; } } /* --------------------------------------------------------------------------- */ void intpl_chroma_block_ver_sse128(pel_t *dst, int i_dst, 
pel_t *src, int i_src, int width, int height, const int8_t *coeff) { int row, col; const short offset = 32; const int shift = 6; int bsym = (coeff[1] == coeff[2]); __m128i mAddOffset = _mm_set1_epi16(offset); pel_t const *p; __m128i mask = _mm_loadu_si128((__m128i*)(intrinsic_mask[(width & 7) - 1])); src -= i_src; if (bsym) { __m128i coeff0 = _mm_set1_epi8(coeff[0]); __m128i coeff1 = _mm_set1_epi8(coeff[1]); __m128i mVal; for (row = 0; row < height - 3; row += 4) { p = src; for (col = 0; col < width - 7; col += 8) { __m128i T00 = _mm_loadu_si128((__m128i*)(p)); __m128i T10 = _mm_loadu_si128((__m128i*)(p + i_src)); __m128i T20 = _mm_loadu_si128((__m128i*)(p + 2 * i_src)); __m128i T30 = _mm_loadu_si128((__m128i*)(p + 3 * i_src)); __m128i T40 = _mm_loadu_si128((__m128i*)(p + 4 * i_src)); __m128i T50 = _mm_loadu_si128((__m128i*)(p + 5 * i_src)); __m128i T60 = _mm_loadu_si128((__m128i*)(p + 6 * i_src)); __m128i T0 = _mm_unpacklo_epi8(T00, T30); __m128i T1 = _mm_unpacklo_epi8(T10, T20); T0 = _mm_maddubs_epi16(T0, coeff0); T1 = _mm_maddubs_epi16(T1, coeff1); mVal = _mm_add_epi16(T0, T1); mVal = _mm_add_epi16(mVal, mAddOffset); mVal = _mm_srai_epi16(mVal, shift); mVal = _mm_packus_epi16(mVal, mVal); _mm_storel_epi64((__m128i*)&dst[col], mVal); T0 = _mm_unpacklo_epi8(T10, T40); T1 = _mm_unpacklo_epi8(T20, T30); T0 = _mm_maddubs_epi16(T0, coeff0); T1 = _mm_maddubs_epi16(T1, coeff1); mVal = _mm_add_epi16(T0, T1); mVal = _mm_add_epi16(mVal, mAddOffset); mVal = _mm_srai_epi16(mVal, shift); mVal = _mm_packus_epi16(mVal, mVal); _mm_storel_epi64((__m128i*)(&dst[col] + i_dst), mVal); T0 = _mm_unpacklo_epi8(T20, T50); T1 = _mm_unpacklo_epi8(T30, T40); T0 = _mm_maddubs_epi16(T0, coeff0); T1 = _mm_maddubs_epi16(T1, coeff1); mVal = _mm_add_epi16(T0, T1); mVal = _mm_add_epi16(mVal, mAddOffset); mVal = _mm_srai_epi16(mVal, shift); mVal = _mm_packus_epi16(mVal, mVal); _mm_storel_epi64((__m128i*)(&dst[col] + 2 * i_dst), mVal); T0 = _mm_unpacklo_epi8(T30, T60); T1 = _mm_unpacklo_epi8(T40, T50); T0 = _mm_maddubs_epi16(T0, coeff0); T1 = _mm_maddubs_epi16(T1, coeff1); mVal = _mm_add_epi16(T0, T1); mVal = _mm_add_epi16(mVal, mAddOffset); mVal = _mm_srai_epi16(mVal, shift); mVal = _mm_packus_epi16(mVal, mVal); _mm_storel_epi64((__m128i*)(&dst[col] + 3 * i_dst), mVal); p += 8; } if (col < width) { // store either 1, 2, 3, 4, 5, 6, or 7 8-bit results in dst __m128i T00 = _mm_loadu_si128((__m128i*)(p)); __m128i T10 = _mm_loadu_si128((__m128i*)(p + i_src)); __m128i T20 = _mm_loadu_si128((__m128i*)(p + 2 * i_src)); __m128i T30 = _mm_loadu_si128((__m128i*)(p + 3 * i_src)); __m128i T40 = _mm_loadu_si128((__m128i*)(p + 4 * i_src)); __m128i T50 = _mm_loadu_si128((__m128i*)(p + 5 * i_src)); __m128i T60 = _mm_loadu_si128((__m128i*)(p + 6 * i_src)); __m128i T0 = _mm_unpacklo_epi8(T00, T30); __m128i T1 = _mm_unpacklo_epi8(T10, T20); T0 = _mm_maddubs_epi16(T0, coeff0); T1 = _mm_maddubs_epi16(T1, coeff1); mVal = _mm_add_epi16(T0, T1); mVal = _mm_add_epi16(mVal, mAddOffset); mVal = _mm_srai_epi16(mVal, shift); mVal = _mm_packus_epi16(mVal, mVal); _mm_maskmoveu_si128(mVal, mask, (char *)&dst[col]); T0 = _mm_unpacklo_epi8(T10, T40); T1 = _mm_unpacklo_epi8(T20, T30); T0 = _mm_maddubs_epi16(T0, coeff0); T1 = _mm_maddubs_epi16(T1, coeff1); mVal = _mm_add_epi16(T0, T1); mVal = _mm_add_epi16(mVal, mAddOffset); mVal = _mm_srai_epi16(mVal, shift); mVal = _mm_packus_epi16(mVal, mVal); _mm_maskmoveu_si128(mVal, mask, (char *)(&dst[col] + i_dst)); T0 = _mm_unpacklo_epi8(T20, T50); T1 = _mm_unpacklo_epi8(T30, T40); T0 = _mm_maddubs_epi16(T0, 
coeff0); T1 = _mm_maddubs_epi16(T1, coeff1); mVal = _mm_add_epi16(T0, T1); mVal = _mm_add_epi16(mVal, mAddOffset); mVal = _mm_srai_epi16(mVal, shift); mVal = _mm_packus_epi16(mVal, mVal); _mm_maskmoveu_si128(mVal, mask, (char *)(&dst[col] + 2 * i_dst)); T0 = _mm_unpacklo_epi8(T30, T60); T1 = _mm_unpacklo_epi8(T40, T50); T0 = _mm_maddubs_epi16(T0, coeff0); T1 = _mm_maddubs_epi16(T1, coeff1); mVal = _mm_add_epi16(T0, T1); mVal = _mm_add_epi16(mVal, mAddOffset); mVal = _mm_srai_epi16(mVal, shift); mVal = _mm_packus_epi16(mVal, mVal); _mm_maskmoveu_si128(mVal, mask, (char *)(&dst[col] + 3 * i_dst)); } src += 4 * i_src; dst += 4 * i_dst; } for (; row < height; row++) { p = src; for (col = 0; col < width - 7; col += 8) { __m128i T00 = _mm_loadu_si128((__m128i*)(p)); __m128i T10 = _mm_loadu_si128((__m128i*)(p + i_src)); __m128i T20 = _mm_loadu_si128((__m128i*)(p + 2 * i_src)); __m128i T30 = _mm_loadu_si128((__m128i*)(p + 3 * i_src)); T00 = _mm_unpacklo_epi8(T00, T30); T10 = _mm_unpacklo_epi8(T10, T20); T00 = _mm_maddubs_epi16(T00, coeff0); T10 = _mm_maddubs_epi16(T10, coeff1); mVal = _mm_add_epi16(T00, T10); mVal = _mm_add_epi16(mVal, mAddOffset); mVal = _mm_srai_epi16(mVal, shift); mVal = _mm_packus_epi16(mVal, mVal); _mm_storel_epi64((__m128i*)&dst[col], mVal); p += 8; } if (col < width) { // store either 1, 2, 3, 4, 5, 6, or 7 8-bit results in dst __m128i T00 = _mm_loadu_si128((__m128i*)(p)); __m128i T10 = _mm_loadu_si128((__m128i*)(p + i_src)); __m128i T20 = _mm_loadu_si128((__m128i*)(p + 2 * i_src)); __m128i T30 = _mm_loadu_si128((__m128i*)(p + 3 * i_src)); T00 = _mm_unpacklo_epi8(T00, T30); T10 = _mm_unpacklo_epi8(T10, T20); T00 = _mm_maddubs_epi16(T00, coeff0); T10 = _mm_maddubs_epi16(T10, coeff1); mVal = _mm_add_epi16(T00, T10); mVal = _mm_add_epi16(mVal, mAddOffset); mVal = _mm_srai_epi16(mVal, shift); mVal = _mm_packus_epi16(mVal, mVal); _mm_maskmoveu_si128(mVal, mask, (char *)&dst[col]); } src += i_src; dst += i_dst; } } else { __m128i coeff0 = _mm_set1_epi16(*(short*)coeff); __m128i coeff1 = _mm_set1_epi16(*(short*)(coeff + 2)); __m128i mVal; for (row = 0; row < height - 3; row += 4) { p = src; for (col = 0; col < width - 7; col += 8) { __m128i T00 = _mm_loadu_si128((__m128i*)(p)); __m128i T10 = _mm_loadu_si128((__m128i*)(p + i_src)); __m128i T20 = _mm_loadu_si128((__m128i*)(p + 2 * i_src)); __m128i T30 = _mm_loadu_si128((__m128i*)(p + 3 * i_src)); __m128i T40 = _mm_loadu_si128((__m128i*)(p + 4 * i_src)); __m128i T50 = _mm_loadu_si128((__m128i*)(p + 5 * i_src)); __m128i T60 = _mm_loadu_si128((__m128i*)(p + 6 * i_src)); __m128i T0 = _mm_unpacklo_epi8(T00, T10); __m128i T1 = _mm_unpacklo_epi8(T20, T30); T0 = _mm_maddubs_epi16(T0, coeff0); T1 = _mm_maddubs_epi16(T1, coeff1); mVal = _mm_add_epi16(T0, T1); mVal = _mm_add_epi16(mVal, mAddOffset); mVal = _mm_srai_epi16(mVal, shift); mVal = _mm_packus_epi16(mVal, mVal); _mm_storel_epi64((__m128i*)&dst[col], mVal); T0 = _mm_unpacklo_epi8(T10, T20); T1 = _mm_unpacklo_epi8(T30, T40); T0 = _mm_maddubs_epi16(T0, coeff0); T1 = _mm_maddubs_epi16(T1, coeff1); mVal = _mm_add_epi16(T0, T1); mVal = _mm_add_epi16(mVal, mAddOffset); mVal = _mm_srai_epi16(mVal, shift); mVal = _mm_packus_epi16(mVal, mVal); _mm_storel_epi64((__m128i*)(&dst[col] + i_dst), mVal); T0 = _mm_unpacklo_epi8(T20, T30); T1 = _mm_unpacklo_epi8(T40, T50); T0 = _mm_maddubs_epi16(T0, coeff0); T1 = _mm_maddubs_epi16(T1, coeff1); mVal = _mm_add_epi16(T0, T1); mVal = _mm_add_epi16(mVal, mAddOffset); mVal = _mm_srai_epi16(mVal, shift); mVal = _mm_packus_epi16(mVal, mVal); 
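/* Each output row in this branch is one 4-tap pass built from two
 * _mm_maddubs_epi16 calls: byte-interleaving rows r/r+1 and r+2/r+3 and
 * multiplying by the packed (c0,c1) and (c2,c3) coefficient pairs gives,
 * per 16-bit lane, the scalar equivalent (illustrative sketch only;
 * clip_to_u8 is a hypothetical helper):
 *
 *     sum = c0*p[0] + c1*p[i_src] + c2*p[2*i_src] + c3*p[3*i_src];
 *     dst_pixel = clip_to_u8((sum + 32) >> 6);   // chroma filter gain is 64
 *
 * _mm_maddubs_epi16 treats the pixel bytes as unsigned and the coefficient
 * bytes as signed, which matches unsigned 8-bit pel_t data. */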
_mm_storel_epi64((__m128i*)(&dst[col] + 2 * i_dst), mVal); T0 = _mm_unpacklo_epi8(T30, T40); T1 = _mm_unpacklo_epi8(T50, T60); T0 = _mm_maddubs_epi16(T0, coeff0); T1 = _mm_maddubs_epi16(T1, coeff1); mVal = _mm_add_epi16(T0, T1); mVal = _mm_add_epi16(mVal, mAddOffset); mVal = _mm_srai_epi16(mVal, shift); mVal = _mm_packus_epi16(mVal, mVal); _mm_storel_epi64((__m128i*)(&dst[col] + 3 * i_dst), mVal); p += 8; } if (col < width) { // store either 1, 2, 3, 4, 5, 6, or 7 8-bit results in dst __m128i T00 = _mm_loadu_si128((__m128i*)(p)); __m128i T10 = _mm_loadu_si128((__m128i*)(p + i_src)); __m128i T20 = _mm_loadu_si128((__m128i*)(p + 2 * i_src)); __m128i T30 = _mm_loadu_si128((__m128i*)(p + 3 * i_src)); __m128i T40 = _mm_loadu_si128((__m128i*)(p + 4 * i_src)); __m128i T50 = _mm_loadu_si128((__m128i*)(p + 5 * i_src)); __m128i T60 = _mm_loadu_si128((__m128i*)(p + 6 * i_src)); __m128i T0 = _mm_unpacklo_epi8(T00, T10); __m128i T1 = _mm_unpacklo_epi8(T20, T30); T0 = _mm_maddubs_epi16(T0, coeff0); T1 = _mm_maddubs_epi16(T1, coeff1); mVal = _mm_add_epi16(T0, T1); mVal = _mm_add_epi16(mVal, mAddOffset); mVal = _mm_srai_epi16(mVal, shift); mVal = _mm_packus_epi16(mVal, mVal); _mm_maskmoveu_si128(mVal, mask, (char *)&dst[col]); T0 = _mm_unpacklo_epi8(T10, T20); T1 = _mm_unpacklo_epi8(T30, T40); T0 = _mm_maddubs_epi16(T0, coeff0); T1 = _mm_maddubs_epi16(T1, coeff1); mVal = _mm_add_epi16(T0, T1); mVal = _mm_add_epi16(mVal, mAddOffset); mVal = _mm_srai_epi16(mVal, shift); mVal = _mm_packus_epi16(mVal, mVal); _mm_maskmoveu_si128(mVal, mask, (char *)(&dst[col] + i_dst)); T0 = _mm_unpacklo_epi8(T20, T30); T1 = _mm_unpacklo_epi8(T40, T50); T0 = _mm_maddubs_epi16(T0, coeff0); T1 = _mm_maddubs_epi16(T1, coeff1); mVal = _mm_add_epi16(T0, T1); mVal = _mm_add_epi16(mVal, mAddOffset); mVal = _mm_srai_epi16(mVal, shift); mVal = _mm_packus_epi16(mVal, mVal); _mm_maskmoveu_si128(mVal, mask, (char *)(&dst[col] + 2 * i_dst)); T0 = _mm_unpacklo_epi8(T30, T40); T1 = _mm_unpacklo_epi8(T50, T60); T0 = _mm_maddubs_epi16(T0, coeff0); T1 = _mm_maddubs_epi16(T1, coeff1); mVal = _mm_add_epi16(T0, T1); mVal = _mm_add_epi16(mVal, mAddOffset); mVal = _mm_srai_epi16(mVal, shift); mVal = _mm_packus_epi16(mVal, mVal); _mm_maskmoveu_si128(mVal, mask, (char *)(&dst[col] + 3 * i_dst)); } src += 4 * i_src; dst += 4 * i_dst; } for (; row < height; row++) { p = src; for (col = 0; col < width - 7; col += 8) { __m128i T00 = _mm_loadu_si128((__m128i*)(p)); __m128i T10 = _mm_loadu_si128((__m128i*)(p + i_src)); __m128i T20 = _mm_loadu_si128((__m128i*)(p + 2 * i_src)); __m128i T30 = _mm_loadu_si128((__m128i*)(p + 3 * i_src)); T00 = _mm_unpacklo_epi8(T00, T10); T10 = _mm_unpacklo_epi8(T20, T30); T00 = _mm_maddubs_epi16(T00, coeff0); T10 = _mm_maddubs_epi16(T10, coeff1); mVal = _mm_add_epi16(T00, T10); mVal = _mm_add_epi16(mVal, mAddOffset); mVal = _mm_srai_epi16(mVal, shift); mVal = _mm_packus_epi16(mVal, mVal); _mm_storel_epi64((__m128i*)&dst[col], mVal); p += 8; } if (col < width) { // store either 1, 2, 3, 4, 5, 6, or 7 8-bit results in dst __m128i T00 = _mm_loadu_si128((__m128i*)(p)); __m128i T10 = _mm_loadu_si128((__m128i*)(p + i_src)); __m128i T20 = _mm_loadu_si128((__m128i*)(p + 2 * i_src)); __m128i T30 = _mm_loadu_si128((__m128i*)(p + 3 * i_src)); T00 = _mm_unpacklo_epi8(T00, T10); T10 = _mm_unpacklo_epi8(T20, T30); T00 = _mm_maddubs_epi16(T00, coeff0); T10 = _mm_maddubs_epi16(T10, coeff1); mVal = _mm_add_epi16(T00, T10); mVal = _mm_add_epi16(mVal, mAddOffset); mVal = _mm_srai_epi16(mVal, shift); mVal = _mm_packus_epi16(mVal, mVal); 
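/* Tail columns: _mm_maskmoveu_si128 writes only the bytes whose mask byte
 * has its top bit set. Assuming intrinsic_mask[k] holds k+1 leading 0xFF
 * bytes (as its uses here imply), intrinsic_mask[(width & 7) - 1] limits
 * the store to the 1..7 leftover pixels; e.g. width == 12 gives
 * width & 7 == 4, so with col == 8 only dst[8..11] are written. */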
_mm_maskmoveu_si128(mVal, mask, (char *)&dst[col]); } src += i_src; dst += i_dst; } } } /* --------------------------------------------------------------------------- */ void intpl_luma_block_ver_sse128(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff) { const short offset = 32; const int shift = 6; int row, col; int bsym = (coeff[1] == coeff[6]); __m128i mAddOffset = _mm_set1_epi16(offset); pel_t const *p; __m128i mask = _mm_loadu_si128((__m128i*)(intrinsic_mask[(width & 7) - 1])); src -= 3 * i_src; if (bsym) { __m128i coeff0 = _mm_set1_epi8(coeff[0]); __m128i coeff1 = _mm_set1_epi8(coeff[1]); __m128i coeff2 = _mm_set1_epi8(coeff[2]); __m128i coeff3 = _mm_set1_epi8(coeff[3]); for (row = 0; row < height - 3; row += 4) { __m128i mVal; p = src; for (col = 0; col < width - 7; col += 8) { __m128i T00 = _mm_loadu_si128((__m128i*)(p)); __m128i T10 = _mm_loadu_si128((__m128i*)(p + i_src)); __m128i T20 = _mm_loadu_si128((__m128i*)(p + 2 * i_src)); __m128i T30 = _mm_loadu_si128((__m128i*)(p + 3 * i_src)); __m128i T40 = _mm_loadu_si128((__m128i*)(p + 4 * i_src)); __m128i T50 = _mm_loadu_si128((__m128i*)(p + 5 * i_src)); __m128i T60 = _mm_loadu_si128((__m128i*)(p + 6 * i_src)); __m128i T70 = _mm_loadu_si128((__m128i*)(p + 7 * i_src)); __m128i T80 = _mm_loadu_si128((__m128i*)(p + 8 * i_src)); __m128i T90 = _mm_loadu_si128((__m128i*)(p + 9 * i_src)); __m128i Ta0 = _mm_loadu_si128((__m128i*)(p + 10 * i_src)); __m128i T1 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T00, T70), coeff0); __m128i T2 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T10, T60), coeff1); __m128i T3 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T20, T50), coeff2); __m128i T4 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T30, T40), coeff3); mVal = _mm_add_epi16(_mm_add_epi16(T1, T2), _mm_add_epi16(T3, T4)); mVal = _mm_srai_epi16(_mm_add_epi16(mVal, mAddOffset), shift); mVal = _mm_packus_epi16(mVal, mVal); _mm_storel_epi64((__m128i*)&dst[col], mVal); T1 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T10, T80), coeff0); T2 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T20, T70), coeff1); T3 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T30, T60), coeff2); T4 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T40, T50), coeff3); mVal = _mm_add_epi16(_mm_add_epi16(T1, T2), _mm_add_epi16(T3, T4)); mVal = _mm_srai_epi16(_mm_add_epi16(mVal, mAddOffset), shift); mVal = _mm_packus_epi16(mVal, mVal); _mm_storel_epi64((__m128i*)(&dst[col] + i_dst), mVal); T1 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T20, T90), coeff0); T2 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T30, T80), coeff1); T3 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T40, T70), coeff2); T4 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T50, T60), coeff3); mVal = _mm_add_epi16(_mm_add_epi16(T1, T2), _mm_add_epi16(T3, T4)); mVal = _mm_srai_epi16(_mm_add_epi16(mVal, mAddOffset), shift); mVal = _mm_packus_epi16(mVal, mVal); _mm_storel_epi64((__m128i*)(&dst[col] + 2 * i_dst), mVal); T1 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T30, Ta0), coeff0); T2 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T40, T90), coeff1); T3 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T50, T80), coeff2); T4 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T60, T70), coeff3); mVal = _mm_add_epi16(_mm_add_epi16(T1, T2), _mm_add_epi16(T3, T4)); mVal = _mm_srai_epi16(_mm_add_epi16(mVal, mAddOffset), shift); mVal = _mm_packus_epi16(mVal, mVal); _mm_storel_epi64((__m128i*)(&dst[col] + 3 * i_dst), mVal); p += 8; } if (col < width) { // store either 1, 2, 3, 4, 5, 6, or 7 8-bit results in dst __m128i T00 = _mm_loadu_si128((__m128i*)(p)); __m128i T10 = _mm_loadu_si128((__m128i*)(p + i_src)); __m128i 
T20 = _mm_loadu_si128((__m128i*)(p + 2 * i_src)); __m128i T30 = _mm_loadu_si128((__m128i*)(p + 3 * i_src)); __m128i T40 = _mm_loadu_si128((__m128i*)(p + 4 * i_src)); __m128i T50 = _mm_loadu_si128((__m128i*)(p + 5 * i_src)); __m128i T60 = _mm_loadu_si128((__m128i*)(p + 6 * i_src)); __m128i T70 = _mm_loadu_si128((__m128i*)(p + 7 * i_src)); __m128i T80 = _mm_loadu_si128((__m128i*)(p + 8 * i_src)); __m128i T90 = _mm_loadu_si128((__m128i*)(p + 9 * i_src)); __m128i Ta0 = _mm_loadu_si128((__m128i*)(p + 10 * i_src)); __m128i T1 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T00, T70), coeff0); __m128i T2 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T10, T60), coeff1); __m128i T3 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T20, T50), coeff2); __m128i T4 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T30, T40), coeff3); mVal = _mm_add_epi16(_mm_add_epi16(T1, T2), _mm_add_epi16(T3, T4)); mVal = _mm_srai_epi16(_mm_add_epi16(mVal, mAddOffset), shift); mVal = _mm_packus_epi16(mVal, mVal); _mm_maskmoveu_si128(mVal, mask, (char *)&dst[col]); T1 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T10, T80), coeff0); T2 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T20, T70), coeff1); T3 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T30, T60), coeff2); T4 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T40, T50), coeff3); mVal = _mm_add_epi16(_mm_add_epi16(T1, T2), _mm_add_epi16(T3, T4)); mVal = _mm_srai_epi16(_mm_add_epi16(mVal, mAddOffset), shift); mVal = _mm_packus_epi16(mVal, mVal); _mm_maskmoveu_si128(mVal, mask, (char *)(&dst[col] + i_dst)); T1 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T20, T90), coeff0); T2 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T30, T80), coeff1); T3 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T40, T70), coeff2); T4 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T50, T60), coeff3); mVal = _mm_add_epi16(_mm_add_epi16(T1, T2), _mm_add_epi16(T3, T4)); mVal = _mm_srai_epi16(_mm_add_epi16(mVal, mAddOffset), shift); mVal = _mm_packus_epi16(mVal, mVal); _mm_maskmoveu_si128(mVal, mask, (char *)(&dst[col] + 2 * i_dst)); T1 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T30, Ta0), coeff0); T2 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T40, T90), coeff1); T3 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T50, T80), coeff2); T4 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T60, T70), coeff3); mVal = _mm_add_epi16(_mm_add_epi16(T1, T2), _mm_add_epi16(T3, T4)); mVal = _mm_srai_epi16(_mm_add_epi16(mVal, mAddOffset), shift); mVal = _mm_packus_epi16(mVal, mVal); _mm_maskmoveu_si128(mVal, mask, (char *)(&dst[col] + 3 * i_dst)); } src += 4 * i_src; dst += 4 * i_dst; } } else { __m128i coeff0 = _mm_set1_epi16(*(short*)coeff); __m128i coeff1 = _mm_set1_epi16(*(short*)(coeff + 2)); __m128i coeff2 = _mm_set1_epi16(*(short*)(coeff + 4)); __m128i coeff3 = _mm_set1_epi16(*(short*)(coeff + 6)); for (row = 0; row < height - 3; row += 4) { __m128i mVal; p = src; for (col = 0; col < width - 7; col += 8) { __m128i T00 = _mm_loadu_si128((__m128i*)(p)); __m128i T10 = _mm_loadu_si128((__m128i*)(p + i_src)); __m128i T20 = _mm_loadu_si128((__m128i*)(p + 2 * i_src)); __m128i T30 = _mm_loadu_si128((__m128i*)(p + 3 * i_src)); __m128i T40 = _mm_loadu_si128((__m128i*)(p + 4 * i_src)); __m128i T50 = _mm_loadu_si128((__m128i*)(p + 5 * i_src)); __m128i T60 = _mm_loadu_si128((__m128i*)(p + 6 * i_src)); __m128i T70 = _mm_loadu_si128((__m128i*)(p + 7 * i_src)); __m128i T80 = _mm_loadu_si128((__m128i*)(p + 8 * i_src)); __m128i T90 = _mm_loadu_si128((__m128i*)(p + 9 * i_src)); __m128i Ta0 = _mm_loadu_si128((__m128i*)(p + 10 * i_src)); __m128i T1 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T00, T10), coeff0); __m128i T2 = 
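/* The asymmetric branch pairs adjacent rows instead:
 * _mm_set1_epi16(*(short*)coeff) broadcasts two consecutive int8 taps
 * (c0, c1) as a byte pair, so
 *
 *     _mm_maddubs_epi16(_mm_unpacklo_epi8(r0, r1), coeff0)
 *
 * produces c0*r0 + c1*r1 per pixel in 16 bits, and four such maddubs cover
 * all eight taps of the sliding window.
 */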
_mm_maddubs_epi16(_mm_unpacklo_epi8(T20, T30), coeff1); __m128i T3 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T40, T50), coeff2); __m128i T4 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T60, T70), coeff3); mVal = _mm_add_epi16(_mm_add_epi16(T1, T2), _mm_add_epi16(T3, T4)); mVal = _mm_srai_epi16(_mm_add_epi16(mVal, mAddOffset), shift); mVal = _mm_packus_epi16(mVal, mVal); _mm_storel_epi64((__m128i*)&dst[col], mVal); T1 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T10, T20), coeff0); T2 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T30, T40), coeff1); T3 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T50, T60), coeff2); T4 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T70, T80), coeff3); mVal = _mm_add_epi16(_mm_add_epi16(T1, T2), _mm_add_epi16(T3, T4)); mVal = _mm_srai_epi16(_mm_add_epi16(mVal, mAddOffset), shift); mVal = _mm_packus_epi16(mVal, mVal); _mm_storel_epi64((__m128i*)(&dst[col] + i_dst), mVal); T1 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T20, T30), coeff0); T2 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T40, T50), coeff1); T3 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T60, T70), coeff2); T4 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T80, T90), coeff3); mVal = _mm_add_epi16(_mm_add_epi16(T1, T2), _mm_add_epi16(T3, T4)); mVal = _mm_srai_epi16(_mm_add_epi16(mVal, mAddOffset), shift); mVal = _mm_packus_epi16(mVal, mVal); _mm_storel_epi64((__m128i*)(&dst[col] + 2 * i_dst), mVal); T1 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T30, T40), coeff0); T2 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T50, T60), coeff1); T3 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T70, T80), coeff2); T4 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T90, Ta0), coeff3); mVal = _mm_add_epi16(_mm_add_epi16(T1, T2), _mm_add_epi16(T3, T4)); mVal = _mm_srai_epi16(_mm_add_epi16(mVal, mAddOffset), shift); mVal = _mm_packus_epi16(mVal, mVal); _mm_storel_epi64((__m128i*)(&dst[col] + 3 * i_dst), mVal); p += 8; } if (col < width) { // store either 1, 2, 3, 4, 5, 6, or 7 8-bit results in dst __m128i T00 = _mm_loadu_si128((__m128i*)(p)); __m128i T10 = _mm_loadu_si128((__m128i*)(p + i_src)); __m128i T20 = _mm_loadu_si128((__m128i*)(p + 2 * i_src)); __m128i T30 = _mm_loadu_si128((__m128i*)(p + 3 * i_src)); __m128i T40 = _mm_loadu_si128((__m128i*)(p + 4 * i_src)); __m128i T50 = _mm_loadu_si128((__m128i*)(p + 5 * i_src)); __m128i T60 = _mm_loadu_si128((__m128i*)(p + 6 * i_src)); __m128i T70 = _mm_loadu_si128((__m128i*)(p + 7 * i_src)); __m128i T80 = _mm_loadu_si128((__m128i*)(p + 8 * i_src)); __m128i T90 = _mm_loadu_si128((__m128i*)(p + 9 * i_src)); __m128i Ta0 = _mm_loadu_si128((__m128i*)(p + 10 * i_src)); __m128i T1 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T00, T10), coeff0); __m128i T2 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T20, T30), coeff1); __m128i T3 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T40, T50), coeff2); __m128i T4 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T60, T70), coeff3); mVal = _mm_add_epi16(_mm_add_epi16(T1, T2), _mm_add_epi16(T3, T4)); mVal = _mm_srai_epi16(_mm_add_epi16(mVal, mAddOffset), shift); mVal = _mm_packus_epi16(mVal, mVal); _mm_maskmoveu_si128(mVal, mask, (char *)&dst[col]); T1 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T10, T20), coeff0); T2 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T30, T40), coeff1); T3 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T50, T60), coeff2); T4 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T70, T80), coeff3); mVal = _mm_add_epi16(_mm_add_epi16(T1, T2), _mm_add_epi16(T3, T4)); mVal = _mm_srai_epi16(_mm_add_epi16(mVal, mAddOffset), shift); mVal = _mm_packus_epi16(mVal, mVal); _mm_maskmoveu_si128(mVal, mask, (char *)(&dst[col] + i_dst)); T1 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T20, T30), 
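/* Sketch under an assumption: the (col < width) tails store the final
 * width % 8 pixels with _mm_maskmoveu_si128, which writes only the bytes
 * whose mask byte has its MSB set. The indexing intrinsic_mask[(width & 7) - 1]
 * suggests entry k holds k + 1 leading 0xFF bytes (the table itself is
 * defined elsewhere and assumed here); e.g. for width % 8 == 4:
 *
 *     static const uint8_t m4[16] = { 0xFF, 0xFF, 0xFF, 0xFF };  // rest 0
 *     _mm_maskmoveu_si128(mVal, _mm_loadu_si128((const __m128i *)m4),
 *                         (char *)&dst[col]);    // touches dst[col..col+3]
 */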
coeff0);
            T2 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T40, T50), coeff1);
            T3 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T60, T70), coeff2);
            T4 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T80, T90), coeff3);
            mVal = _mm_add_epi16(_mm_add_epi16(T1, T2), _mm_add_epi16(T3, T4));
            mVal = _mm_srai_epi16(_mm_add_epi16(mVal, mAddOffset), shift);
            mVal = _mm_packus_epi16(mVal, mVal);
            _mm_maskmoveu_si128(mVal, mask, (char *)(&dst[col] + 2 * i_dst));

            T1 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T30, T40), coeff0);
            T2 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T50, T60), coeff1);
            T3 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T70, T80), coeff2);
            T4 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T90, Ta0), coeff3);
            mVal = _mm_add_epi16(_mm_add_epi16(T1, T2), _mm_add_epi16(T3, T4));
            mVal = _mm_srai_epi16(_mm_add_epi16(mVal, mAddOffset), shift);
            mVal = _mm_packus_epi16(mVal, mVal);
            _mm_maskmoveu_si128(mVal, mask, (char *)(&dst[col] + 3 * i_dst));
        }
        src += 4 * i_src;
        dst += 4 * i_dst;
    }
    }
}

/* --------------------------------------------------------------------------- */
void intpl_luma_block_ver0_sse128(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff)
{   /* -1, 4, -10, 57, 19, -7, 3, -1 */
    const short offset = 32;
    const int shift = 6;
    int row, col;
    __m128i mAddOffset = _mm_set1_epi16(offset);
    pel_t const *p;
    __m128i mask = _mm_loadu_si128((__m128i*)(intrinsic_mask[(width & 7) - 1]));

    src -= 3 * i_src;

    /* __m128i coeff0 = _mm_set1_epi16(*(short*)coeff); */      /* -1   4 */
    __m128i coeff1 = _mm_set1_epi16(*(short*)(coeff + 2));       /* -10 57 */
    __m128i coeff2 = _mm_set1_epi16(*(short*)(coeff + 4));       /* 19  -7 */
    /* __m128i coeff3 = _mm_set1_epi16(*(short*)(coeff + 6)); */ /* 3   -1 */

    for (row = 0; row < height; row++) {
        __m128i mVal;
        p = src;
        for (col = 0; col < width - 7; col += 8) {
            __m128i T00 = _mm_loadu_si128((__m128i*)(p));
            __m128i T10 = _mm_loadu_si128((__m128i*)(p + i_src));
            __m128i T20 = _mm_loadu_si128((__m128i*)(p + 2 * i_src));
            __m128i T30 = _mm_loadu_si128((__m128i*)(p + 3 * i_src));
            __m128i T40 = _mm_loadu_si128((__m128i*)(p + 4 * i_src));
            __m128i T50 = _mm_loadu_si128((__m128i*)(p + 5 * i_src));
            __m128i T60 = _mm_loadu_si128((__m128i*)(p + 6 * i_src));
            __m128i T70 = _mm_loadu_si128((__m128i*)(p + 7 * i_src));

            T00 = _mm_adds_epi16(_mm_cvtepu8_epi16(T00), _mm_cvtepu8_epi16(T70));
            T10 = _mm_adds_epi16(_mm_cvtepu8_epi16(T10), _mm_cvtepu8_epi16(T60));
            T10 = _mm_subs_epi16(_mm_slli_epi16(T10, 2), _mm_cvtepu8_epi16(T60)); /* 4*r1 + 3*r6 == ((r1 + r6) << 2) - r6 */
            T20 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T20, T30), coeff1);
            T30 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T40, T50), coeff2);

            mVal = _mm_add_epi16(_mm_sub_epi16(T10, T00), _mm_add_epi16(T20, T30));
            mVal = _mm_srai_epi16(_mm_add_epi16(mVal, mAddOffset), shift);
            mVal = _mm_packus_epi16(mVal, mVal);
            _mm_storel_epi64((__m128i*)&dst[col], mVal);
            p += 8;
        }

        if (col < width) { /* store either 1, 2, 3, 4, 5, 6, or 7 8-bit results in dst */
            __m128i T00 = _mm_loadu_si128((__m128i*)(p));
            __m128i T10 = _mm_loadu_si128((__m128i*)(p + i_src));
            __m128i T20 = _mm_loadu_si128((__m128i*)(p + 2 * i_src));
            __m128i T30 = _mm_loadu_si128((__m128i*)(p + 3 * i_src));
            __m128i T40 = _mm_loadu_si128((__m128i*)(p + 4 * i_src));
            __m128i T50 = _mm_loadu_si128((__m128i*)(p + 5 * i_src));
            __m128i T60 = _mm_loadu_si128((__m128i*)(p + 6 * i_src));
            __m128i T70 = _mm_loadu_si128((__m128i*)(p + 7 * i_src));

            T00 = _mm_adds_epi16(_mm_cvtepu8_epi16(T00), _mm_cvtepu8_epi16(T70));
            T10 = _mm_adds_epi16(_mm_cvtepu8_epi16(T10), _mm_cvtepu8_epi16(T60));
            T10 = _mm_subs_epi16(_mm_slli_epi16(T10, 2), _mm_cvtepu8_epi16(T60)); /* 4*r1 + 3*r6 == ((r1 + r6) << 2) - r6 */
            T20 =
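/* intpl_luma_block_ver0_sse128 hard-codes the quarter-pel filter
 * {-1, 4, -10, 57, 19, -7, 3, -1}: the +/-1 taps are plain adds/subtracts of
 * the widened rows, and the 4/3 pair is strength-reduced to a shift, leaving
 * only the middle coefficient pairs for _mm_maddubs_epi16. The identity used:
 *
 *     assert(4 * a + 3 * b == ((a + b) << 2) - b);    // any int a, b
 */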
_mm_maddubs_epi16(_mm_unpacklo_epi8(T20, T30), coeff1); T30 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T40, T50), coeff2); mVal = _mm_add_epi16(_mm_sub_epi16(T10, T00), _mm_add_epi16(T20, T30)); mVal = _mm_srai_epi16(_mm_add_epi16(mVal, mAddOffset), shift); mVal = _mm_packus_epi16(mVal, mVal); _mm_maskmoveu_si128(mVal, mask, (char *)&dst[col]); } src += i_src; dst += i_dst; } } /* --------------------------------------------------------------------------- */ void intpl_luma_block_ver1_sse128(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff) {//-1, 4, -11, 40, 40, -11, 4, -1 const short offset = 32; const int shift = 6; int row, col; __m128i mAddOffset = _mm_set1_epi16(offset); pel_t const *p; __m128i mask = _mm_loadu_si128((__m128i*)(intrinsic_mask[(width & 7) - 1])); src -= 3 * i_src; __m128i coeff2 = _mm_set1_epi8(coeff[2]);//-11 __m128i coeff3 = _mm_set1_epi8(coeff[3]);//40 for (row = 0; row < height; row++) { __m128i mVal; p = src; for (col = 0; col < width - 7; col += 8) { __m128i T00 = _mm_loadu_si128((__m128i*)(p)); __m128i T10 = _mm_loadu_si128((__m128i*)(p + i_src)); __m128i T20 = _mm_loadu_si128((__m128i*)(p + 2 * i_src)); __m128i T30 = _mm_loadu_si128((__m128i*)(p + 3 * i_src)); __m128i T40 = _mm_loadu_si128((__m128i*)(p + 4 * i_src)); __m128i T50 = _mm_loadu_si128((__m128i*)(p + 5 * i_src)); __m128i T60 = _mm_loadu_si128((__m128i*)(p + 6 * i_src)); __m128i T70 = _mm_loadu_si128((__m128i*)(p + 7 * i_src)); T00 = _mm_adds_epi16(_mm_cvtepu8_epi16(T00), _mm_cvtepu8_epi16(T70)); T10 = _mm_adds_epi16(_mm_cvtepu8_epi16(T10), _mm_cvtepu8_epi16(T60)); T10 = _mm_slli_epi16(T10, 2); T20 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T20, T50), coeff2); T30 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T30, T40), coeff3); mVal = _mm_add_epi16(_mm_sub_epi16(T10, T00), _mm_add_epi16(T20, T30)); mVal = _mm_srai_epi16(_mm_add_epi16(mVal, mAddOffset), shift); mVal = _mm_packus_epi16(mVal, mVal); _mm_storel_epi64((__m128i*)&dst[col], mVal); p += 8; } if (col < width) { // store either 1, 2, 3, 4, 5, 6, or 7 8-bit results in dst __m128i T00 = _mm_loadu_si128((__m128i*)(p)); __m128i T10 = _mm_loadu_si128((__m128i*)(p + i_src)); __m128i T20 = _mm_loadu_si128((__m128i*)(p + 2 * i_src)); __m128i T30 = _mm_loadu_si128((__m128i*)(p + 3 * i_src)); __m128i T40 = _mm_loadu_si128((__m128i*)(p + 4 * i_src)); __m128i T50 = _mm_loadu_si128((__m128i*)(p + 5 * i_src)); __m128i T60 = _mm_loadu_si128((__m128i*)(p + 6 * i_src)); __m128i T70 = _mm_loadu_si128((__m128i*)(p + 7 * i_src)); T00 = _mm_adds_epi16(_mm_cvtepu8_epi16(T00), _mm_cvtepu8_epi16(T70)); T10 = _mm_adds_epi16(_mm_cvtepu8_epi16(T10), _mm_cvtepu8_epi16(T60)); T10 = _mm_slli_epi16(T10, 2); T20 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T20, T50), coeff2); T30 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T30, T40), coeff3); mVal = _mm_add_epi16(_mm_sub_epi16(T10, T00), _mm_add_epi16(T20, T30)); mVal = _mm_srai_epi16(_mm_add_epi16(mVal, mAddOffset), shift); mVal = _mm_packus_epi16(mVal, mVal); _mm_maskmoveu_si128(mVal, mask, (char *)&dst[col]); } src += i_src; dst += i_dst; } } /* --------------------------------------------------------------------------- */ void intpl_luma_block_ver2_sse128(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff) {//-1, 3, -7, 19, 57, -10, 4, -1 const short offset = 32; const int shift = 6; int row, col; __m128i mAddOffset = _mm_set1_epi16(offset); pel_t const *p; __m128i mask = _mm_loadu_si128((__m128i*)(intrinsic_mask[(width & 7) - 1])); src -= 3 * i_src; //__m128i coeff0 = 
_mm_set1_epi16(*(short*)coeff); __m128i coeff1 = _mm_set1_epi16(*(short*)(coeff + 2)); __m128i coeff2 = _mm_set1_epi16(*(short*)(coeff + 4)); //__m128i coeff3 = _mm_set1_epi16(*(short*)(coeff + 6)); for (row = 0; row < height; row++) { __m128i mVal; p = src; for (col = 0; col < width - 7; col += 8) { __m128i T00 = _mm_loadu_si128((__m128i*)(p)); __m128i T10 = _mm_loadu_si128((__m128i*)(p + i_src)); __m128i T20 = _mm_loadu_si128((__m128i*)(p + 2 * i_src)); __m128i T30 = _mm_loadu_si128((__m128i*)(p + 3 * i_src)); __m128i T40 = _mm_loadu_si128((__m128i*)(p + 4 * i_src)); __m128i T50 = _mm_loadu_si128((__m128i*)(p + 5 * i_src)); __m128i T60 = _mm_loadu_si128((__m128i*)(p + 6 * i_src)); __m128i T70 = _mm_loadu_si128((__m128i*)(p + 7 * i_src)); T00 = _mm_adds_epi16(_mm_cvtepu8_epi16(T00), _mm_cvtepu8_epi16(T70)); T60 = _mm_adds_epi16(_mm_cvtepu8_epi16(T10), _mm_cvtepu8_epi16(T60)); T10 = _mm_subs_epi16(_mm_slli_epi16(T60, 2), _mm_cvtepu8_epi16(T10)); T20 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T20, T30), coeff1); T30 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T40, T50), coeff2); mVal = _mm_add_epi16(_mm_sub_epi16(T10, T00), _mm_add_epi16(T20, T30)); mVal = _mm_srai_epi16(_mm_add_epi16(mVal, mAddOffset), shift); mVal = _mm_packus_epi16(mVal, mVal); _mm_storel_epi64((__m128i*)&dst[col], mVal); p += 8; } if (col < width) { // store either 1, 2, 3, 4, 5, 6, or 7 8-bit results in dst __m128i T00 = _mm_loadu_si128((__m128i*)(p)); __m128i T10 = _mm_loadu_si128((__m128i*)(p + i_src)); __m128i T20 = _mm_loadu_si128((__m128i*)(p + 2 * i_src)); __m128i T30 = _mm_loadu_si128((__m128i*)(p + 3 * i_src)); __m128i T40 = _mm_loadu_si128((__m128i*)(p + 4 * i_src)); __m128i T50 = _mm_loadu_si128((__m128i*)(p + 5 * i_src)); __m128i T60 = _mm_loadu_si128((__m128i*)(p + 6 * i_src)); __m128i T70 = _mm_loadu_si128((__m128i*)(p + 7 * i_src)); T00 = _mm_adds_epi16(_mm_cvtepu8_epi16(T00), _mm_cvtepu8_epi16(T70)); T60 = _mm_adds_epi16(_mm_cvtepu8_epi16(T10), _mm_cvtepu8_epi16(T60)); T10 = _mm_subs_epi16(_mm_slli_epi16(T60, 2), _mm_cvtepu8_epi16(T10)); T20 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T20, T30), coeff1); T30 = _mm_maddubs_epi16(_mm_unpacklo_epi8(T40, T50), coeff2); mVal = _mm_add_epi16(_mm_sub_epi16(T10, T00), _mm_add_epi16(T20, T30)); mVal = _mm_srai_epi16(_mm_add_epi16(mVal, mAddOffset), shift); mVal = _mm_packus_epi16(mVal, mVal); _mm_maskmoveu_si128(mVal, mask, (char *)&dst[col]); } src += i_src; dst += i_dst; } } /* --------------------------------------------------------------------------- */ void intpl_chroma_block_ext_sse128(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coef_x, const int8_t *coef_y) { ALIGN16(int16_t tmp_res[(32 + 3) * 32]); int16_t *tmp = tmp_res; const int i_tmp = 32; int row, col; int shift; int16_t const *p; int bsymy = (coef_y[1] == coef_y[6]); __m128i mAddOffset; __m128i mSwitch1 = _mm_setr_epi8(0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6); __m128i mSwitch2 = _mm_setr_epi8(4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10); __m128i mCoefx = _mm_set1_epi32(*(int*)coef_x); __m128i mask = _mm_loadu_si128((__m128i*)(intrinsic_mask[(width & 7) - 1])); // HOR src = src - 1 * i_src - 1; if (width > 4) { for (row = -1; row < height + 2; row++) { __m128i mT0, mT1, mV01; for (col = 0; col < width; col += 8) { __m128i mSrc = _mm_loadu_si128((__m128i*)(src + col)); mT0 = _mm_maddubs_epi16(_mm_shuffle_epi8(mSrc, mSwitch1), mCoefx); mT1 = _mm_maddubs_epi16(_mm_shuffle_epi8(mSrc, mSwitch2), mCoefx); mV01 = _mm_hadd_epi16(mT0, mT1); _mm_store_si128((__m128i*)&tmp[col], 
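/* Reference sketch (not part of the build): the chroma path is separable.
 * The horizontal 4-tap pass stores unrounded 16-bit sums in tmp_res; the
 * vertical pass accumulates them in 32 bits and rounds once with
 * shift = 12 (6 + 6) and offset 1 << 11. One output pixel in scalar form,
 * names hypothetical:
 *
 *     static int chroma_2pass_1px(const uint8_t *s, int i_s,
 *                                 const int8_t *cx, const int8_t *cy)
 *     {
 *         int i, j, v[4], sum = 0;
 *         for (j = 0; j < 4; j++) {          // horizontal, fits in int16_t
 *             for (v[j] = 0, i = 0; i < 4; i++) {
 *                 v[j] += cx[i] * s[j * i_s + i];
 *             }
 *         }
 *         for (j = 0; j < 4; j++) {          // vertical, 32-bit accumulate
 *             sum += cy[j] * v[j];
 *         }
 *         sum = (sum + (1 << 11)) >> 12;
 *         return sum < 0 ? 0 : (sum > 255 ? 255 : sum);
 *     }
 */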
mV01); } src += i_src; tmp += i_tmp; } } else { for (row = -1; row < height + 2; row++) { __m128i mSrc = _mm_loadu_si128((__m128i*)src); __m128i mT0 = _mm_maddubs_epi16(_mm_shuffle_epi8(mSrc, mSwitch1), mCoefx); __m128i mV01 = _mm_hadd_epi16(mT0, mT0); _mm_storel_epi64((__m128i*)tmp, mV01); src += i_src; tmp += i_tmp; } } // VER shift = 12; mAddOffset = _mm_set1_epi32(1 << 11); tmp = tmp_res; if (bsymy) { __m128i mCoefy1 = _mm_set1_epi16(coef_y[0]); __m128i mCoefy2 = _mm_set1_epi16(coef_y[1]); for (row = 0; row < height; row += 2) { p = tmp; for (col = 0; col < width - 7; col += 8) { __m128i mV01, mV02; __m128i mV11, mV12; __m128i T0 = _mm_loadu_si128((__m128i*)(p)); __m128i T1 = _mm_loadu_si128((__m128i*)(p + i_tmp)); __m128i T2 = _mm_loadu_si128((__m128i*)(p + 2 * i_tmp)); __m128i T3 = _mm_loadu_si128((__m128i*)(p + 3 * i_tmp)); __m128i T4 = _mm_loadu_si128((__m128i*)(p + 4 * i_tmp)); __m128i M00 = _mm_unpacklo_epi16(T0, T3); __m128i M01 = _mm_unpacklo_epi16(T1, T2); __m128i M02 = _mm_unpackhi_epi16(T0, T3); __m128i M03 = _mm_unpackhi_epi16(T1, T2); __m128i M10 = _mm_unpacklo_epi16(T1, T4); __m128i M11 = _mm_unpacklo_epi16(T2, T3); __m128i M12 = _mm_unpackhi_epi16(T1, T4); __m128i M13 = _mm_unpackhi_epi16(T2, T3); mV01 = _mm_add_epi32(_mm_madd_epi16(M00, mCoefy1), _mm_madd_epi16(M01, mCoefy2)); mV02 = _mm_add_epi32(_mm_madd_epi16(M02, mCoefy1), _mm_madd_epi16(M03, mCoefy2)); mV11 = _mm_add_epi32(_mm_madd_epi16(M10, mCoefy1), _mm_madd_epi16(M11, mCoefy2)); mV12 = _mm_add_epi32(_mm_madd_epi16(M12, mCoefy1), _mm_madd_epi16(M13, mCoefy2)); mV01 = _mm_srai_epi32(_mm_add_epi32(mV01, mAddOffset), shift); mV02 = _mm_srai_epi32(_mm_add_epi32(mV02, mAddOffset), shift); mV11 = _mm_srai_epi32(_mm_add_epi32(mV11, mAddOffset), shift); mV12 = _mm_srai_epi32(_mm_add_epi32(mV12, mAddOffset), shift); mV01 = _mm_packs_epi32 (mV01, mV02); mV01 = _mm_packus_epi16(mV01, mV01); mV11 = _mm_packs_epi32 (mV11, mV12); mV11 = _mm_packus_epi16(mV11, mV11); _mm_storel_epi64((__m128i*)&dst[col], mV01); _mm_storel_epi64((__m128i*)&dst[col + i_dst], mV11); p += 8; } if (col < width) { // store either 1, 2, 3, 4, 5, 6, or 7 8-bit results in dst __m128i mV01, mV02; __m128i mV11, mV12; __m128i T0 = _mm_loadu_si128((__m128i*)(p)); __m128i T1 = _mm_loadu_si128((__m128i*)(p + i_tmp)); __m128i T2 = _mm_loadu_si128((__m128i*)(p + 2 * i_tmp)); __m128i T3 = _mm_loadu_si128((__m128i*)(p + 3 * i_tmp)); __m128i T4 = _mm_loadu_si128((__m128i*)(p + 4 * i_tmp)); __m128i M00 = _mm_unpacklo_epi16(T0, T3); __m128i M01 = _mm_unpacklo_epi16(T1, T2); __m128i M02 = _mm_unpackhi_epi16(T0, T3); __m128i M03 = _mm_unpackhi_epi16(T1, T2); __m128i M10 = _mm_unpacklo_epi16(T1, T4); __m128i M11 = _mm_unpacklo_epi16(T2, T3); __m128i M12 = _mm_unpackhi_epi16(T1, T4); __m128i M13 = _mm_unpackhi_epi16(T2, T3); mV01 = _mm_add_epi32(_mm_madd_epi16(M00, mCoefy1), _mm_madd_epi16(M01, mCoefy2)); mV02 = _mm_add_epi32(_mm_madd_epi16(M02, mCoefy1), _mm_madd_epi16(M03, mCoefy2)); mV11 = _mm_add_epi32(_mm_madd_epi16(M10, mCoefy1), _mm_madd_epi16(M11, mCoefy2)); mV12 = _mm_add_epi32(_mm_madd_epi16(M12, mCoefy1), _mm_madd_epi16(M13, mCoefy2)); mV01 = _mm_srai_epi32(_mm_add_epi32(mV01, mAddOffset), shift); mV02 = _mm_srai_epi32(_mm_add_epi32(mV02, mAddOffset), shift); mV11 = _mm_srai_epi32(_mm_add_epi32(mV11, mAddOffset), shift); mV12 = _mm_srai_epi32(_mm_add_epi32(mV12, mAddOffset), shift); mV01 = _mm_packs_epi32 (mV01, mV02); mV01 = _mm_packus_epi16(mV01, mV01); mV11 = _mm_packs_epi32 (mV11, mV12); mV11 = _mm_packus_epi16(mV11, mV11); _mm_maskmoveu_si128(mV01, 
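/* On the 16-bit intermediate rows the symmetric pair trick returns via
 * _mm_madd_epi16: interleaving rows (t0, t3) and (t1, t2) with
 * _mm_unpacklo/hi_epi16 lets each madd emit the 32-bit sums c0*(t0 + t3)
 * and c1*(t1 + t2) directly, e.g.
 *
 *     __m128i lo = _mm_unpacklo_epi16(T0, T3);
 *     __m128i s  = _mm_madd_epi16(lo, _mm_set1_epi16(c0));  // c0*t0 + c0*t3
 */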
mask, (char *)&dst[col]);
                _mm_maskmoveu_si128(mV11, mask, (char *)&dst[col + i_dst]);
            }
            tmp += i_tmp * 2;
            dst += i_dst * 2;
        }
    } else {
        __m128i coeff0 = _mm_set1_epi16(*(short*)coef_y);
        __m128i coeff1 = _mm_set1_epi16(*(short*)(coef_y + 2));
        coeff0 = _mm_cvtepi8_epi16(coeff0);
        coeff1 = _mm_cvtepi8_epi16(coeff1);
        for (row = 0; row < height; row += 2) {
            p = tmp;
            for (col = 0; col < width - 7; col += 8) {
                __m128i mV01, mV02;
                __m128i mV11, mV12;
                __m128i T0 = _mm_loadu_si128((__m128i*)(p));
                __m128i T1 = _mm_loadu_si128((__m128i*)(p + i_tmp));
                __m128i T2 = _mm_loadu_si128((__m128i*)(p + 2 * i_tmp));
                __m128i T3 = _mm_loadu_si128((__m128i*)(p + 3 * i_tmp));
                __m128i T4 = _mm_loadu_si128((__m128i*)(p + 4 * i_tmp));
                __m128i M00 = _mm_unpacklo_epi16(T0, T1);
                __m128i M01 = _mm_unpacklo_epi16(T2, T3);
                __m128i M02 = _mm_unpackhi_epi16(T0, T1);
                __m128i M03 = _mm_unpackhi_epi16(T2, T3);
                __m128i M10 = _mm_unpacklo_epi16(T1, T2);
                __m128i M11 = _mm_unpacklo_epi16(T3, T4);
                __m128i M12 = _mm_unpackhi_epi16(T1, T2);
                __m128i M13 = _mm_unpackhi_epi16(T3, T4);
                mV01 = _mm_add_epi32(_mm_madd_epi16(M00, coeff0), _mm_madd_epi16(M01, coeff1));
                mV02 = _mm_add_epi32(_mm_madd_epi16(M02, coeff0), _mm_madd_epi16(M03, coeff1));
                mV11 = _mm_add_epi32(_mm_madd_epi16(M10, coeff0), _mm_madd_epi16(M11, coeff1));
                mV12 = _mm_add_epi32(_mm_madd_epi16(M12, coeff0), _mm_madd_epi16(M13, coeff1));
                mV01 = _mm_srai_epi32(_mm_add_epi32(mV01, mAddOffset), shift);
                mV02 = _mm_srai_epi32(_mm_add_epi32(mV02, mAddOffset), shift);
                mV11 = _mm_srai_epi32(_mm_add_epi32(mV11, mAddOffset), shift);
                mV12 = _mm_srai_epi32(_mm_add_epi32(mV12, mAddOffset), shift);
                mV01 = _mm_packs_epi32 (mV01, mV02);
                mV01 = _mm_packus_epi16(mV01, mV01);
                mV11 = _mm_packs_epi32 (mV11, mV12);
                mV11 = _mm_packus_epi16(mV11, mV11);
                _mm_storel_epi64((__m128i*)&dst[col], mV01);
                _mm_storel_epi64((__m128i*)&dst[col + i_dst], mV11);
                p += 8;
            }
            if (col < width) { /* store either 1, 2, 3, 4, 5, 6, or 7 8-bit results in dst */
                __m128i mV01, mV02;
                __m128i mV11, mV12;
                __m128i T0 = _mm_loadu_si128((__m128i*)(p));
                __m128i T1 = _mm_loadu_si128((__m128i*)(p + i_tmp));
                __m128i T2 = _mm_loadu_si128((__m128i*)(p + 2 * i_tmp));
                __m128i T3 = _mm_loadu_si128((__m128i*)(p + 3 * i_tmp));
                __m128i T4 = _mm_loadu_si128((__m128i*)(p + 4 * i_tmp));
                __m128i M00 = _mm_unpacklo_epi16(T0, T1);
                __m128i M01 = _mm_unpacklo_epi16(T2, T3);
                __m128i M02 = _mm_unpackhi_epi16(T0, T1);
                __m128i M03 = _mm_unpackhi_epi16(T2, T3);
                __m128i M10 = _mm_unpacklo_epi16(T1, T2);
                __m128i M11 = _mm_unpacklo_epi16(T3, T4);
                __m128i M12 = _mm_unpackhi_epi16(T1, T2);
                __m128i M13 = _mm_unpackhi_epi16(T3, T4);
                mV01 = _mm_add_epi32(_mm_madd_epi16(M00, coeff0), _mm_madd_epi16(M01, coeff1));
                mV02 = _mm_add_epi32(_mm_madd_epi16(M02, coeff0), _mm_madd_epi16(M03, coeff1));
                mV11 = _mm_add_epi32(_mm_madd_epi16(M10, coeff0), _mm_madd_epi16(M11, coeff1));
                mV12 = _mm_add_epi32(_mm_madd_epi16(M12, coeff0), _mm_madd_epi16(M13, coeff1));
                mV01 = _mm_srai_epi32(_mm_add_epi32(mV01, mAddOffset), shift);
                mV02 = _mm_srai_epi32(_mm_add_epi32(mV02, mAddOffset), shift);
                mV11 = _mm_srai_epi32(_mm_add_epi32(mV11, mAddOffset), shift);
                mV12 = _mm_srai_epi32(_mm_add_epi32(mV12, mAddOffset), shift);
                mV01 = _mm_packs_epi32 (mV01, mV02);
                mV01 = _mm_packus_epi16(mV01, mV01);
                mV11 = _mm_packs_epi32 (mV11, mV12);
                mV11 = _mm_packus_epi16(mV11, mV11);
                _mm_maskmoveu_si128(mV01, mask, (char *)&dst[col]);
                _mm_maskmoveu_si128(mV11, mask, (char *)&dst[col + i_dst]);
            }
            tmp += i_tmp * 2;
            dst += i_dst * 2;
        }
    }
}

/*
--------------------------------------------------------------------------- */ void intpl_luma_block_ext_sse128(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coef_x, const int8_t *coef_y) { ALIGN16(int16_t tmp_res[(64 + 7) * 64]); int16_t *tmp = tmp_res; const int i_tmp = 64; int row, col; int shift = 12; int16_t const *p; int bsymy = (coef_y[1] == coef_y[6]); __m128i mAddOffset = _mm_set1_epi32(1 << (shift - 1)); __m128i mSwitch1 = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8); __m128i mSwitch2 = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9, 10); __m128i mSwitch3 = _mm_setr_epi8(4, 5, 6, 7, 8, 9, 10, 11, 5, 6, 7, 8, 9, 10, 11, 12); __m128i mSwitch4 = _mm_setr_epi8(6, 7, 8, 9, 10, 11, 12, 13, 7, 8, 9, 10, 11, 12, 13, 14); __m128i mCoefx = _mm_loadl_epi64((__m128i*)coef_x); __m128i mask = _mm_loadu_si128((__m128i*)(intrinsic_mask[(width & 7) - 1])); mCoefx = _mm_unpacklo_epi64(mCoefx, mCoefx); // HOR src -= (3 * i_src + 3); for (row = -3; row < height + 4; row++) { for (col = 0; col < width; col += 8) { __m128i mSrc = _mm_loadu_si128((__m128i*)(src + col)); __m128i mT0 = _mm_maddubs_epi16(_mm_shuffle_epi8(mSrc, mSwitch1), mCoefx); __m128i mT1 = _mm_maddubs_epi16(_mm_shuffle_epi8(mSrc, mSwitch2), mCoefx); __m128i mT2 = _mm_maddubs_epi16(_mm_shuffle_epi8(mSrc, mSwitch3), mCoefx); __m128i mT3 = _mm_maddubs_epi16(_mm_shuffle_epi8(mSrc, mSwitch4), mCoefx); __m128i mVal = _mm_hadd_epi16(_mm_hadd_epi16(mT0, mT1), _mm_hadd_epi16(mT2, mT3)); _mm_store_si128((__m128i*)&tmp[col], mVal); } src += i_src; tmp += i_tmp; } // VER tmp = tmp_res; if (bsymy) { __m128i mCoefy1 = _mm_set1_epi16(coef_y[0]); __m128i mCoefy2 = _mm_set1_epi16(coef_y[1]); __m128i mCoefy3 = _mm_set1_epi16(coef_y[2]); __m128i mCoefy4 = _mm_set1_epi16(coef_y[3]); for (row = 0; row < height - 3; row += 4) { p = tmp; for (col = 0; col < width - 7; col += 8) { __m128i T00 = _mm_loadu_si128((__m128i*)(p)); __m128i T10 = _mm_loadu_si128((__m128i*)(p + i_tmp)); __m128i T20 = _mm_loadu_si128((__m128i*)(p + 2 * i_tmp)); __m128i T30 = _mm_loadu_si128((__m128i*)(p + 3 * i_tmp)); __m128i T40 = _mm_loadu_si128((__m128i*)(p + 4 * i_tmp)); __m128i T50 = _mm_loadu_si128((__m128i*)(p + 5 * i_tmp)); __m128i T60 = _mm_loadu_si128((__m128i*)(p + 6 * i_tmp)); __m128i T70 = _mm_loadu_si128((__m128i*)(p + 7 * i_tmp)); __m128i T80 = _mm_loadu_si128((__m128i*)(p + 8 * i_tmp)); __m128i T90 = _mm_loadu_si128((__m128i*)(p + 9 * i_tmp)); __m128i Ta0 = _mm_loadu_si128((__m128i*)(p + 10 * i_tmp)); __m128i T0 = _mm_unpacklo_epi16(T00, T70); __m128i T1 = _mm_unpacklo_epi16(T10, T60); __m128i T2 = _mm_unpacklo_epi16(T20, T50); __m128i T3 = _mm_unpacklo_epi16(T30, T40); __m128i T4 = _mm_unpackhi_epi16(T00, T70); __m128i T5 = _mm_unpackhi_epi16(T10, T60); __m128i T6 = _mm_unpackhi_epi16(T20, T50); __m128i T7 = _mm_unpackhi_epi16(T30, T40); __m128i mVal1, mVal2, mVal; T0 = _mm_madd_epi16(T0, mCoefy1); T1 = _mm_madd_epi16(T1, mCoefy2); T2 = _mm_madd_epi16(T2, mCoefy3); T3 = _mm_madd_epi16(T3, mCoefy4); T4 = _mm_madd_epi16(T4, mCoefy1); T5 = _mm_madd_epi16(T5, mCoefy2); T6 = _mm_madd_epi16(T6, mCoefy3); T7 = _mm_madd_epi16(T7, mCoefy4); mVal1 = _mm_add_epi32(_mm_add_epi32(T0, T1), _mm_add_epi32(T2, T3)); mVal2 = _mm_add_epi32(_mm_add_epi32(T4, T5), _mm_add_epi32(T6, T7)); mVal1 = _mm_srai_epi32(_mm_add_epi32(mVal1, mAddOffset), shift); mVal2 = _mm_srai_epi32(_mm_add_epi32(mVal2, mAddOffset), shift); mVal = _mm_packs_epi32(mVal1, mVal2); mVal = _mm_packus_epi16(mVal, mVal); 
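/* intpl_luma_block_ext_sse128 mirrors the chroma path with 8 taps: the
 * horizontal pass covers rows -3 .. height+3 into the (64 + 7) x 64 int16_t
 * scratch buffer and all rounding is deferred to the single shift by 12.
 * The 16-bit intermediate cannot overflow: assuming 8-bit samples and an
 * AVS2 half-pel filter such as {-1, 4, -11, 40, 40, -11, 4, -1},
 *
 *     |hor sum| <= 255 * (1 + 4 + 11 + 40 + 40 + 11 + 4 + 1)
 *               == 255 * 112 == 28560 < 32767.
 */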
_mm_storel_epi64((__m128i*)&dst[col], mVal); T0 = _mm_unpacklo_epi16(T10, T80); T1 = _mm_unpacklo_epi16(T20, T70); T2 = _mm_unpacklo_epi16(T30, T60); T3 = _mm_unpacklo_epi16(T40, T50); T4 = _mm_unpackhi_epi16(T10, T80); T5 = _mm_unpackhi_epi16(T20, T70); T6 = _mm_unpackhi_epi16(T30, T60); T7 = _mm_unpackhi_epi16(T40, T50); T0 = _mm_madd_epi16(T0, mCoefy1); T1 = _mm_madd_epi16(T1, mCoefy2); T2 = _mm_madd_epi16(T2, mCoefy3); T3 = _mm_madd_epi16(T3, mCoefy4); T4 = _mm_madd_epi16(T4, mCoefy1); T5 = _mm_madd_epi16(T5, mCoefy2); T6 = _mm_madd_epi16(T6, mCoefy3); T7 = _mm_madd_epi16(T7, mCoefy4); mVal1 = _mm_add_epi32(_mm_add_epi32(T0, T1), _mm_add_epi32(T2, T3)); mVal2 = _mm_add_epi32(_mm_add_epi32(T4, T5), _mm_add_epi32(T6, T7)); mVal1 = _mm_srai_epi32(_mm_add_epi32(mVal1, mAddOffset), shift); mVal2 = _mm_srai_epi32(_mm_add_epi32(mVal2, mAddOffset), shift); mVal = _mm_packs_epi32(mVal1, mVal2); mVal = _mm_packus_epi16(mVal, mVal); _mm_storel_epi64((__m128i*)(&dst[col] + i_dst), mVal); T0 = _mm_unpacklo_epi16(T20, T90); T1 = _mm_unpacklo_epi16(T30, T80); T2 = _mm_unpacklo_epi16(T40, T70); T3 = _mm_unpacklo_epi16(T50, T60); T4 = _mm_unpackhi_epi16(T20, T90); T5 = _mm_unpackhi_epi16(T30, T80); T6 = _mm_unpackhi_epi16(T40, T70); T7 = _mm_unpackhi_epi16(T50, T60); T0 = _mm_madd_epi16(T0, mCoefy1); T1 = _mm_madd_epi16(T1, mCoefy2); T2 = _mm_madd_epi16(T2, mCoefy3); T3 = _mm_madd_epi16(T3, mCoefy4); T4 = _mm_madd_epi16(T4, mCoefy1); T5 = _mm_madd_epi16(T5, mCoefy2); T6 = _mm_madd_epi16(T6, mCoefy3); T7 = _mm_madd_epi16(T7, mCoefy4); mVal1 = _mm_add_epi32(_mm_add_epi32(T0, T1), _mm_add_epi32(T2, T3)); mVal2 = _mm_add_epi32(_mm_add_epi32(T4, T5), _mm_add_epi32(T6, T7)); mVal1 = _mm_srai_epi32(_mm_add_epi32(mVal1, mAddOffset), shift); mVal2 = _mm_srai_epi32(_mm_add_epi32(mVal2, mAddOffset), shift); mVal = _mm_packs_epi32(mVal1, mVal2); mVal = _mm_packus_epi16(mVal, mVal); _mm_storel_epi64((__m128i*)(&dst[col] + 2 * i_dst), mVal); T0 = _mm_unpacklo_epi16(T30, Ta0); T1 = _mm_unpacklo_epi16(T40, T90); T2 = _mm_unpacklo_epi16(T50, T80); T3 = _mm_unpacklo_epi16(T60, T70); T4 = _mm_unpackhi_epi16(T30, Ta0); T5 = _mm_unpackhi_epi16(T40, T90); T6 = _mm_unpackhi_epi16(T50, T80); T7 = _mm_unpackhi_epi16(T60, T70); T0 = _mm_madd_epi16(T0, mCoefy1); T1 = _mm_madd_epi16(T1, mCoefy2); T2 = _mm_madd_epi16(T2, mCoefy3); T3 = _mm_madd_epi16(T3, mCoefy4); T4 = _mm_madd_epi16(T4, mCoefy1); T5 = _mm_madd_epi16(T5, mCoefy2); T6 = _mm_madd_epi16(T6, mCoefy3); T7 = _mm_madd_epi16(T7, mCoefy4); mVal1 = _mm_add_epi32(_mm_add_epi32(T0, T1), _mm_add_epi32(T2, T3)); mVal2 = _mm_add_epi32(_mm_add_epi32(T4, T5), _mm_add_epi32(T6, T7)); mVal1 = _mm_srai_epi32(_mm_add_epi32(mVal1, mAddOffset), shift); mVal2 = _mm_srai_epi32(_mm_add_epi32(mVal2, mAddOffset), shift); mVal = _mm_packs_epi32(mVal1, mVal2); mVal = _mm_packus_epi16(mVal, mVal); _mm_storel_epi64((__m128i*)(&dst[col] + 3 * i_dst), mVal); p += 8; } if (col < width) { // store either 1, 2, 3, 4, 5, 6, or 7 8-bit results in dst __m128i T00 = _mm_loadu_si128((__m128i*)(p)); __m128i T10 = _mm_loadu_si128((__m128i*)(p + i_tmp)); __m128i T20 = _mm_loadu_si128((__m128i*)(p + 2 * i_tmp)); __m128i T30 = _mm_loadu_si128((__m128i*)(p + 3 * i_tmp)); __m128i T40 = _mm_loadu_si128((__m128i*)(p + 4 * i_tmp)); __m128i T50 = _mm_loadu_si128((__m128i*)(p + 5 * i_tmp)); __m128i T60 = _mm_loadu_si128((__m128i*)(p + 6 * i_tmp)); __m128i T70 = _mm_loadu_si128((__m128i*)(p + 7 * i_tmp)); __m128i T80 = _mm_loadu_si128((__m128i*)(p + 8 * i_tmp)); __m128i T90 = _mm_loadu_si128((__m128i*)(p + 9 * 
i_tmp)); __m128i Ta0 = _mm_loadu_si128((__m128i*)(p + 10 * i_tmp)); __m128i T0 = _mm_unpacklo_epi16(T00, T70); __m128i T1 = _mm_unpacklo_epi16(T10, T60); __m128i T2 = _mm_unpacklo_epi16(T20, T50); __m128i T3 = _mm_unpacklo_epi16(T30, T40); __m128i T4 = _mm_unpackhi_epi16(T00, T70); __m128i T5 = _mm_unpackhi_epi16(T10, T60); __m128i T6 = _mm_unpackhi_epi16(T20, T50); __m128i T7 = _mm_unpackhi_epi16(T30, T40); __m128i mVal1, mVal2, mVal; T0 = _mm_madd_epi16(T0, mCoefy1); T1 = _mm_madd_epi16(T1, mCoefy2); T2 = _mm_madd_epi16(T2, mCoefy3); T3 = _mm_madd_epi16(T3, mCoefy4); T4 = _mm_madd_epi16(T4, mCoefy1); T5 = _mm_madd_epi16(T5, mCoefy2); T6 = _mm_madd_epi16(T6, mCoefy3); T7 = _mm_madd_epi16(T7, mCoefy4); mVal1 = _mm_add_epi32(_mm_add_epi32(T0, T1), _mm_add_epi32(T2, T3)); mVal2 = _mm_add_epi32(_mm_add_epi32(T4, T5), _mm_add_epi32(T6, T7)); mVal1 = _mm_srai_epi32(_mm_add_epi32(mVal1, mAddOffset), shift); mVal2 = _mm_srai_epi32(_mm_add_epi32(mVal2, mAddOffset), shift); mVal = _mm_packs_epi32(mVal1, mVal2); mVal = _mm_packus_epi16(mVal, mVal); _mm_maskmoveu_si128(mVal, mask, (char *)&dst[col]); T0 = _mm_unpacklo_epi16(T10, T80); T1 = _mm_unpacklo_epi16(T20, T70); T2 = _mm_unpacklo_epi16(T30, T60); T3 = _mm_unpacklo_epi16(T40, T50); T4 = _mm_unpackhi_epi16(T10, T80); T5 = _mm_unpackhi_epi16(T20, T70); T6 = _mm_unpackhi_epi16(T30, T60); T7 = _mm_unpackhi_epi16(T40, T50); T0 = _mm_madd_epi16(T0, mCoefy1); T1 = _mm_madd_epi16(T1, mCoefy2); T2 = _mm_madd_epi16(T2, mCoefy3); T3 = _mm_madd_epi16(T3, mCoefy4); T4 = _mm_madd_epi16(T4, mCoefy1); T5 = _mm_madd_epi16(T5, mCoefy2); T6 = _mm_madd_epi16(T6, mCoefy3); T7 = _mm_madd_epi16(T7, mCoefy4); mVal1 = _mm_add_epi32(_mm_add_epi32(T0, T1), _mm_add_epi32(T2, T3)); mVal2 = _mm_add_epi32(_mm_add_epi32(T4, T5), _mm_add_epi32(T6, T7)); mVal1 = _mm_srai_epi32(_mm_add_epi32(mVal1, mAddOffset), shift); mVal2 = _mm_srai_epi32(_mm_add_epi32(mVal2, mAddOffset), shift); mVal = _mm_packs_epi32(mVal1, mVal2); mVal = _mm_packus_epi16(mVal, mVal); _mm_maskmoveu_si128(mVal, mask, (char *)(&dst[col] + i_dst)); T0 = _mm_unpacklo_epi16(T20, T90); T1 = _mm_unpacklo_epi16(T30, T80); T2 = _mm_unpacklo_epi16(T40, T70); T3 = _mm_unpacklo_epi16(T50, T60); T4 = _mm_unpackhi_epi16(T20, T90); T5 = _mm_unpackhi_epi16(T30, T80); T6 = _mm_unpackhi_epi16(T40, T70); T7 = _mm_unpackhi_epi16(T50, T60); T0 = _mm_madd_epi16(T0, mCoefy1); T1 = _mm_madd_epi16(T1, mCoefy2); T2 = _mm_madd_epi16(T2, mCoefy3); T3 = _mm_madd_epi16(T3, mCoefy4); T4 = _mm_madd_epi16(T4, mCoefy1); T5 = _mm_madd_epi16(T5, mCoefy2); T6 = _mm_madd_epi16(T6, mCoefy3); T7 = _mm_madd_epi16(T7, mCoefy4); mVal1 = _mm_add_epi32(_mm_add_epi32(T0, T1), _mm_add_epi32(T2, T3)); mVal2 = _mm_add_epi32(_mm_add_epi32(T4, T5), _mm_add_epi32(T6, T7)); mVal1 = _mm_srai_epi32(_mm_add_epi32(mVal1, mAddOffset), shift); mVal2 = _mm_srai_epi32(_mm_add_epi32(mVal2, mAddOffset), shift); mVal = _mm_packs_epi32(mVal1, mVal2); mVal = _mm_packus_epi16(mVal, mVal); _mm_maskmoveu_si128(mVal, mask, (char *)(&dst[col] + 2 * i_dst)); T0 = _mm_unpacklo_epi16(T30, Ta0); T1 = _mm_unpacklo_epi16(T40, T90); T2 = _mm_unpacklo_epi16(T50, T80); T3 = _mm_unpacklo_epi16(T60, T70); T4 = _mm_unpackhi_epi16(T30, Ta0); T5 = _mm_unpackhi_epi16(T40, T90); T6 = _mm_unpackhi_epi16(T50, T80); T7 = _mm_unpackhi_epi16(T60, T70); T0 = _mm_madd_epi16(T0, mCoefy1); T1 = _mm_madd_epi16(T1, mCoefy2); T2 = _mm_madd_epi16(T2, mCoefy3); T3 = _mm_madd_epi16(T3, mCoefy4); T4 = _mm_madd_epi16(T4, mCoefy1); T5 = _mm_madd_epi16(T5, mCoefy2); T6 = _mm_madd_epi16(T6, mCoefy3); T7 = 
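/* Each pass of this outer loop loads eleven consecutive rows (T00 .. Ta0)
 * once and produces four output rows from the sliding 8-row windows, so
 * output row k (k = 0..3) combines input rows k .. k+7:
 *
 *     dst[k * i_dst + x] = filter(p[(k + 0) * i_tmp + x], ...,
 *                                 p[(k + 7) * i_tmp + x]);
 */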
_mm_madd_epi16(T7, mCoefy4); mVal1 = _mm_add_epi32(_mm_add_epi32(T0, T1), _mm_add_epi32(T2, T3)); mVal2 = _mm_add_epi32(_mm_add_epi32(T4, T5), _mm_add_epi32(T6, T7)); mVal1 = _mm_srai_epi32(_mm_add_epi32(mVal1, mAddOffset), shift); mVal2 = _mm_srai_epi32(_mm_add_epi32(mVal2, mAddOffset), shift); mVal = _mm_packs_epi32(mVal1, mVal2); mVal = _mm_packus_epi16(mVal, mVal); _mm_maskmoveu_si128(mVal, mask, (char *)(&dst[col] + 3 * i_dst)); } tmp += 4 * i_tmp; dst += 4 * i_dst; } } else { __m128i mCoefy1 = _mm_set1_epi16(*(int16_t*)coef_y); __m128i mCoefy2 = _mm_set1_epi16(*(int16_t*)(coef_y + 2)); __m128i mCoefy3 = _mm_set1_epi16(*(int16_t*)(coef_y + 4)); __m128i mCoefy4 = _mm_set1_epi16(*(int16_t*)(coef_y + 6)); mCoefy1 = _mm_cvtepi8_epi16(mCoefy1); mCoefy2 = _mm_cvtepi8_epi16(mCoefy2); mCoefy3 = _mm_cvtepi8_epi16(mCoefy3); mCoefy4 = _mm_cvtepi8_epi16(mCoefy4); for (row = 0; row < height - 3; row += 4) { p = tmp; for (col = 0; col < width - 7; col += 8) { __m128i T00 = _mm_loadu_si128((__m128i*)(p)); __m128i T10 = _mm_loadu_si128((__m128i*)(p + i_tmp)); __m128i T20 = _mm_loadu_si128((__m128i*)(p + 2 * i_tmp)); __m128i T30 = _mm_loadu_si128((__m128i*)(p + 3 * i_tmp)); __m128i T40 = _mm_loadu_si128((__m128i*)(p + 4 * i_tmp)); __m128i T50 = _mm_loadu_si128((__m128i*)(p + 5 * i_tmp)); __m128i T60 = _mm_loadu_si128((__m128i*)(p + 6 * i_tmp)); __m128i T70 = _mm_loadu_si128((__m128i*)(p + 7 * i_tmp)); __m128i T80 = _mm_loadu_si128((__m128i*)(p + 8 * i_tmp)); __m128i T90 = _mm_loadu_si128((__m128i*)(p + 9 * i_tmp)); __m128i Ta0 = _mm_loadu_si128((__m128i*)(p + 10 * i_tmp)); __m128i T0 = _mm_unpacklo_epi16(T00, T10); __m128i T1 = _mm_unpacklo_epi16(T20, T30); __m128i T2 = _mm_unpacklo_epi16(T40, T50); __m128i T3 = _mm_unpacklo_epi16(T60, T70); __m128i T4 = _mm_unpackhi_epi16(T00, T10); __m128i T5 = _mm_unpackhi_epi16(T20, T30); __m128i T6 = _mm_unpackhi_epi16(T40, T50); __m128i T7 = _mm_unpackhi_epi16(T60, T70); __m128i mVal1, mVal2, mVal; T0 = _mm_madd_epi16(T0, mCoefy1); T1 = _mm_madd_epi16(T1, mCoefy2); T2 = _mm_madd_epi16(T2, mCoefy3); T3 = _mm_madd_epi16(T3, mCoefy4); T4 = _mm_madd_epi16(T4, mCoefy1); T5 = _mm_madd_epi16(T5, mCoefy2); T6 = _mm_madd_epi16(T6, mCoefy3); T7 = _mm_madd_epi16(T7, mCoefy4); mVal1 = _mm_add_epi32(_mm_add_epi32(T0, T1), _mm_add_epi32(T2, T3)); mVal2 = _mm_add_epi32(_mm_add_epi32(T4, T5), _mm_add_epi32(T6, T7)); mVal1 = _mm_srai_epi32(_mm_add_epi32(mVal1, mAddOffset), shift); mVal2 = _mm_srai_epi32(_mm_add_epi32(mVal2, mAddOffset), shift); mVal = _mm_packs_epi32(mVal1, mVal2); mVal = _mm_packus_epi16(mVal, mVal); _mm_storel_epi64((__m128i*)&dst[col], mVal); T0 = _mm_unpacklo_epi16(T10, T20); T1 = _mm_unpacklo_epi16(T30, T40); T2 = _mm_unpacklo_epi16(T50, T60); T3 = _mm_unpacklo_epi16(T70, T80); T4 = _mm_unpackhi_epi16(T10, T20); T5 = _mm_unpackhi_epi16(T30, T40); T6 = _mm_unpackhi_epi16(T50, T60); T7 = _mm_unpackhi_epi16(T70, T80); T0 = _mm_madd_epi16(T0, mCoefy1); T1 = _mm_madd_epi16(T1, mCoefy2); T2 = _mm_madd_epi16(T2, mCoefy3); T3 = _mm_madd_epi16(T3, mCoefy4); T4 = _mm_madd_epi16(T4, mCoefy1); T5 = _mm_madd_epi16(T5, mCoefy2); T6 = _mm_madd_epi16(T6, mCoefy3); T7 = _mm_madd_epi16(T7, mCoefy4); mVal1 = _mm_add_epi32(_mm_add_epi32(T0, T1), _mm_add_epi32(T2, T3)); mVal2 = _mm_add_epi32(_mm_add_epi32(T4, T5), _mm_add_epi32(T6, T7)); mVal1 = _mm_srai_epi32(_mm_add_epi32(mVal1, mAddOffset), shift); mVal2 = _mm_srai_epi32(_mm_add_epi32(mVal2, mAddOffset), shift); mVal = _mm_packs_epi32(mVal1, mVal2); mVal = _mm_packus_epi16(mVal, mVal); _mm_storel_epi64((__m128i*)(&dst[col] + 
i_dst), mVal); T0 = _mm_unpacklo_epi16(T20, T30); T1 = _mm_unpacklo_epi16(T40, T50); T2 = _mm_unpacklo_epi16(T60, T70); T3 = _mm_unpacklo_epi16(T80, T90); T4 = _mm_unpackhi_epi16(T20, T30); T5 = _mm_unpackhi_epi16(T40, T50); T6 = _mm_unpackhi_epi16(T60, T70); T7 = _mm_unpackhi_epi16(T80, T90); T0 = _mm_madd_epi16(T0, mCoefy1); T1 = _mm_madd_epi16(T1, mCoefy2); T2 = _mm_madd_epi16(T2, mCoefy3); T3 = _mm_madd_epi16(T3, mCoefy4); T4 = _mm_madd_epi16(T4, mCoefy1); T5 = _mm_madd_epi16(T5, mCoefy2); T6 = _mm_madd_epi16(T6, mCoefy3); T7 = _mm_madd_epi16(T7, mCoefy4); mVal1 = _mm_add_epi32(_mm_add_epi32(T0, T1), _mm_add_epi32(T2, T3)); mVal2 = _mm_add_epi32(_mm_add_epi32(T4, T5), _mm_add_epi32(T6, T7)); mVal1 = _mm_srai_epi32(_mm_add_epi32(mVal1, mAddOffset), shift); mVal2 = _mm_srai_epi32(_mm_add_epi32(mVal2, mAddOffset), shift); mVal = _mm_packs_epi32(mVal1, mVal2); mVal = _mm_packus_epi16(mVal, mVal); _mm_storel_epi64((__m128i*)(&dst[col] + 2 * i_dst), mVal); T0 = _mm_unpacklo_epi16(T30, T40); T1 = _mm_unpacklo_epi16(T50, T60); T2 = _mm_unpacklo_epi16(T70, T80); T3 = _mm_unpacklo_epi16(T90, Ta0); T4 = _mm_unpackhi_epi16(T30, T40); T5 = _mm_unpackhi_epi16(T50, T60); T6 = _mm_unpackhi_epi16(T70, T80); T7 = _mm_unpackhi_epi16(T90, Ta0); T0 = _mm_madd_epi16(T0, mCoefy1); T1 = _mm_madd_epi16(T1, mCoefy2); T2 = _mm_madd_epi16(T2, mCoefy3); T3 = _mm_madd_epi16(T3, mCoefy4); T4 = _mm_madd_epi16(T4, mCoefy1); T5 = _mm_madd_epi16(T5, mCoefy2); T6 = _mm_madd_epi16(T6, mCoefy3); T7 = _mm_madd_epi16(T7, mCoefy4); mVal1 = _mm_add_epi32(_mm_add_epi32(T0, T1), _mm_add_epi32(T2, T3)); mVal2 = _mm_add_epi32(_mm_add_epi32(T4, T5), _mm_add_epi32(T6, T7)); mVal1 = _mm_srai_epi32(_mm_add_epi32(mVal1, mAddOffset), shift); mVal2 = _mm_srai_epi32(_mm_add_epi32(mVal2, mAddOffset), shift); mVal = _mm_packs_epi32(mVal1, mVal2); mVal = _mm_packus_epi16(mVal, mVal); _mm_storel_epi64((__m128i*)(&dst[col] + 3 * i_dst), mVal); p += 8; } if (col < width) { __m128i T00 = _mm_loadu_si128((__m128i*)(p)); __m128i T10 = _mm_loadu_si128((__m128i*)(p + i_tmp)); __m128i T20 = _mm_loadu_si128((__m128i*)(p + 2 * i_tmp)); __m128i T30 = _mm_loadu_si128((__m128i*)(p + 3 * i_tmp)); __m128i T40 = _mm_loadu_si128((__m128i*)(p + 4 * i_tmp)); __m128i T50 = _mm_loadu_si128((__m128i*)(p + 5 * i_tmp)); __m128i T60 = _mm_loadu_si128((__m128i*)(p + 6 * i_tmp)); __m128i T70 = _mm_loadu_si128((__m128i*)(p + 7 * i_tmp)); __m128i T80 = _mm_loadu_si128((__m128i*)(p + 8 * i_tmp)); __m128i T90 = _mm_loadu_si128((__m128i*)(p + 9 * i_tmp)); __m128i Ta0 = _mm_loadu_si128((__m128i*)(p + 10 * i_tmp)); __m128i T0 = _mm_unpacklo_epi16(T00, T10); __m128i T1 = _mm_unpacklo_epi16(T20, T30); __m128i T2 = _mm_unpacklo_epi16(T40, T50); __m128i T3 = _mm_unpacklo_epi16(T60, T70); __m128i T4 = _mm_unpackhi_epi16(T00, T10); __m128i T5 = _mm_unpackhi_epi16(T20, T30); __m128i T6 = _mm_unpackhi_epi16(T40, T50); __m128i T7 = _mm_unpackhi_epi16(T60, T70); __m128i mVal1, mVal2, mVal; T0 = _mm_madd_epi16(T0, mCoefy1); T1 = _mm_madd_epi16(T1, mCoefy2); T2 = _mm_madd_epi16(T2, mCoefy3); T3 = _mm_madd_epi16(T3, mCoefy4); T4 = _mm_madd_epi16(T4, mCoefy1); T5 = _mm_madd_epi16(T5, mCoefy2); T6 = _mm_madd_epi16(T6, mCoefy3); T7 = _mm_madd_epi16(T7, mCoefy4); mVal1 = _mm_add_epi32(_mm_add_epi32(T0, T1), _mm_add_epi32(T2, T3)); mVal2 = _mm_add_epi32(_mm_add_epi32(T4, T5), _mm_add_epi32(T6, T7)); mVal1 = _mm_srai_epi32(_mm_add_epi32(mVal1, mAddOffset), shift); mVal2 = _mm_srai_epi32(_mm_add_epi32(mVal2, mAddOffset), shift); mVal = _mm_packs_epi32(mVal1, mVal2); mVal = _mm_packus_epi16(mVal, 
mVal);
                _mm_maskmoveu_si128(mVal, mask, (char *)&dst[col]);

                T0 = _mm_unpacklo_epi16(T10, T20);
                T1 = _mm_unpacklo_epi16(T30, T40);
                T2 = _mm_unpacklo_epi16(T50, T60);
                T3 = _mm_unpacklo_epi16(T70, T80);
                T4 = _mm_unpackhi_epi16(T10, T20);
                T5 = _mm_unpackhi_epi16(T30, T40);
                T6 = _mm_unpackhi_epi16(T50, T60);
                T7 = _mm_unpackhi_epi16(T70, T80);
                T0 = _mm_madd_epi16(T0, mCoefy1);
                T1 = _mm_madd_epi16(T1, mCoefy2);
                T2 = _mm_madd_epi16(T2, mCoefy3);
                T3 = _mm_madd_epi16(T3, mCoefy4);
                T4 = _mm_madd_epi16(T4, mCoefy1);
                T5 = _mm_madd_epi16(T5, mCoefy2);
                T6 = _mm_madd_epi16(T6, mCoefy3);
                T7 = _mm_madd_epi16(T7, mCoefy4);
                mVal1 = _mm_add_epi32(_mm_add_epi32(T0, T1), _mm_add_epi32(T2, T3));
                mVal2 = _mm_add_epi32(_mm_add_epi32(T4, T5), _mm_add_epi32(T6, T7));
                mVal1 = _mm_srai_epi32(_mm_add_epi32(mVal1, mAddOffset), shift);
                mVal2 = _mm_srai_epi32(_mm_add_epi32(mVal2, mAddOffset), shift);
                mVal = _mm_packs_epi32(mVal1, mVal2);
                mVal = _mm_packus_epi16(mVal, mVal);
                _mm_maskmoveu_si128(mVal, mask, (char *)(&dst[col] + i_dst));

                T0 = _mm_unpacklo_epi16(T20, T30);
                T1 = _mm_unpacklo_epi16(T40, T50);
                T2 = _mm_unpacklo_epi16(T60, T70);
                T3 = _mm_unpacklo_epi16(T80, T90);
                T4 = _mm_unpackhi_epi16(T20, T30);
                T5 = _mm_unpackhi_epi16(T40, T50);
                T6 = _mm_unpackhi_epi16(T60, T70);
                T7 = _mm_unpackhi_epi16(T80, T90);
                T0 = _mm_madd_epi16(T0, mCoefy1);
                T1 = _mm_madd_epi16(T1, mCoefy2);
                T2 = _mm_madd_epi16(T2, mCoefy3);
                T3 = _mm_madd_epi16(T3, mCoefy4);
                T4 = _mm_madd_epi16(T4, mCoefy1);
                T5 = _mm_madd_epi16(T5, mCoefy2);
                T6 = _mm_madd_epi16(T6, mCoefy3);
                T7 = _mm_madd_epi16(T7, mCoefy4);
                mVal1 = _mm_add_epi32(_mm_add_epi32(T0, T1), _mm_add_epi32(T2, T3));
                mVal2 = _mm_add_epi32(_mm_add_epi32(T4, T5), _mm_add_epi32(T6, T7));
                mVal1 = _mm_srai_epi32(_mm_add_epi32(mVal1, mAddOffset), shift);
                mVal2 = _mm_srai_epi32(_mm_add_epi32(mVal2, mAddOffset), shift);
                mVal = _mm_packs_epi32(mVal1, mVal2);
                mVal = _mm_packus_epi16(mVal, mVal);
                _mm_maskmoveu_si128(mVal, mask, (char *)(&dst[col] + 2 * i_dst));

                T0 = _mm_unpacklo_epi16(T30, T40);
                T1 = _mm_unpacklo_epi16(T50, T60);
                T2 = _mm_unpacklo_epi16(T70, T80);
                T3 = _mm_unpacklo_epi16(T90, Ta0);
                T4 = _mm_unpackhi_epi16(T30, T40);
                T5 = _mm_unpackhi_epi16(T50, T60);
                T6 = _mm_unpackhi_epi16(T70, T80);
                T7 = _mm_unpackhi_epi16(T90, Ta0);
                T0 = _mm_madd_epi16(T0, mCoefy1);
                T1 = _mm_madd_epi16(T1, mCoefy2);
                T2 = _mm_madd_epi16(T2, mCoefy3);
                T3 = _mm_madd_epi16(T3, mCoefy4);
                T4 = _mm_madd_epi16(T4, mCoefy1);
                T5 = _mm_madd_epi16(T5, mCoefy2);
                T6 = _mm_madd_epi16(T6, mCoefy3);
                T7 = _mm_madd_epi16(T7, mCoefy4);
                mVal1 = _mm_add_epi32(_mm_add_epi32(T0, T1), _mm_add_epi32(T2, T3));
                mVal2 = _mm_add_epi32(_mm_add_epi32(T4, T5), _mm_add_epi32(T6, T7));
                mVal1 = _mm_srai_epi32(_mm_add_epi32(mVal1, mAddOffset), shift);
                mVal2 = _mm_srai_epi32(_mm_add_epi32(mVal2, mAddOffset), shift);
                mVal = _mm_packs_epi32(mVal1, mVal2);
                mVal = _mm_packus_epi16(mVal, mVal);
                _mm_maskmoveu_si128(mVal, mask, (char *)(&dst[col] + 3 * i_dst));
            }
            tmp += 4 * i_tmp;
            dst += 4 * i_dst;
        }
    }
}
#endif
davs2-1.6/source/common/vec/intrinsic_inter_pred_avx2.cc000066400000000000000000003675111337322544400234520ustar00rootroot00000000000000/*
 * intrinsic_inter_pred_avx2.cc
 *
 * Description of this file:
 *    AVX2 assembly functions of Inter-Prediction module of the davs2 library
 *
 * --------------------------------------------------------------------------
 *
 * davs2 - video decoder of AVS2/IEEE1857.4 video coding standard
 * Copyright (C) 2018~ VCL, NELVT, Peking University
 *
 * Authors: Falei LUO
 *          etc.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at sswang @ pku.edu.cn.
 */

#include "../common.h"
#include "intrinsic.h"

#include <mmintrin.h>
#include <emmintrin.h>
#include <tmmintrin.h>
#include <smmintrin.h>
#include <immintrin.h>

#pragma warning(disable:4127)  // warning C4127: conditional expression is constant

#if !HIGH_BIT_DEPTH

/* --------------------------------------------------------------------------- */

/* --------------------------------------------------------------------------- */
void intpl_luma_block_hor_w16_avx2(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff)
{
    int row, col;
    const int offset = 32;
    const int shift = 6;
    const __m256i mAddOffset = _mm256_set1_epi16((short)offset);
    const __m256i mask16 = _mm256_setr_epi32(-1, -1, -1, -1, 0, 0, 0, 0);
    /* each shuffle pattern below extracts two shifted 8-tap windows per 128-bit lane */
    const __m256i mSwitch1 = _mm256_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8);
    const __m256i mSwitch2 = _mm256_setr_epi8(2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9, 10, 2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9, 10);
    const __m256i mSwitch3 = _mm256_setr_epi8(4, 5, 6, 7, 8, 9, 10, 11, 5, 6, 7, 8, 9, 10, 11, 12, 4, 5, 6, 7, 8, 9, 10, 11, 5, 6, 7, 8, 9, 10, 11, 12);
    const __m256i mSwitch4 = _mm256_setr_epi8(6, 7, 8, 9, 10, 11, 12, 13, 7, 8, 9, 10, 11, 12, 13, 14, 6, 7, 8, 9, 10, 11, 12, 13, 7, 8, 9, 10, 11, 12, 13, 14);
    __m256i mCoef;

    src -= 3;
#if ARCH_X86_64
    mCoef = _mm256_set1_epi64x(*(long long*)coeff);  /* broadcast the 8 filter taps to every 64-bit lane */
#else
    mCoef = _mm256_loadu_si256((__m256i*)coeff);
    mCoef = _mm256_permute4x64_epi64(mCoef, 0x0);
#endif

    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col += 16) {
            __m256i S = _mm256_loadu_si256((__m256i*)(src + col));
            __m256i S0 = _mm256_permute4x64_epi64(S, 0x94);
            __m256i T0, T1, T2, T3;
            __m256i sum;

            T0 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(S0, mSwitch1), mCoef);
            T1 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(S0, mSwitch2), mCoef);
            T2 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(S0, mSwitch3), mCoef);
            T3 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(S0, mSwitch4), mCoef);
            /* 3-level hadd tree: pairwise products -> one 16-bit sum per pixel */
            T0 = _mm256_hadd_epi16(T0, T1);
            T1 = _mm256_hadd_epi16(T2, T3);
            sum = _mm256_hadd_epi16(T0, T1);
            sum = _mm256_srai_epi16(_mm256_add_epi16(sum, mAddOffset), shift);
            sum = _mm256_packus_epi16(sum, sum);
            sum = _mm256_permute4x64_epi64(sum, 0xd8);  /* gather the two lane results into the low 128 bits */
            _mm256_maskstore_epi32((int*)(dst + col), mask16, sum);
        }
        src += i_src;
        dst += i_dst;
    }
}

/* --------------------------------------------------------------------------- */
void intpl_luma_block_hor_w24_avx2(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff)
{
    int row;
    const int offset = 32;
    const int shift = 6;
    const __m256i mAddOffset = _mm256_set1_epi16((short)offset);
    const __m256i index = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);
    const __m256i mask24 = _mm256_setr_epi16(-1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0); const __m256i mSwitch1 = _mm256_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8, 2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9, 10); const __m256i mSwitch2 = _mm256_setr_epi8(4, 5, 6, 7, 8, 9, 10, 11, 5, 6, 7, 8, 9, 10, 11, 12, 6, 7, 8, 9, 10, 11, 12, 13, 7, 8, 9, 10, 11, 12, 13, 14); const __m256i mSwitch3 = _mm256_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8); const __m256i mSwitch4 = _mm256_setr_epi8(2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9, 10, 2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9, 10); const __m256i mSwitch5 = _mm256_setr_epi8(4, 5, 6, 7, 8, 9, 10, 11, 5, 6, 7, 8, 9, 10, 11, 12, 4, 5, 6, 7, 8, 9, 10, 11, 5, 6, 7, 8, 9, 10, 11, 12); const __m256i mSwitch6 = _mm256_setr_epi8(6, 7, 8, 9, 10, 11, 12, 13, 7, 8, 9, 10, 11, 12, 13, 14, 6, 7, 8, 9, 10, 11, 12, 13, 7, 8, 9, 10, 11, 12, 13, 14); __m256i mCoef; UNUSED_PARAMETER(width); src -= 3; #if ARCH_X86_64 mCoef = _mm256_set1_epi64x(*(long long*)coeff); #else mCoef = _mm256_loadu_si256((__m256i*)coeff); mCoef = _mm256_permute4x64_epi64(mCoef, 0x0); #endif for (row = 0; row < height; row++) { __m256i S0 = _mm256_loadu_si256((__m256i*)(src)); __m256i S1 = _mm256_permute4x64_epi64(S0, 0x99); __m256i T0, T1, T2, T3, T4, T5; __m256i sum1, sum2; T0 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(S1, mSwitch1), mCoef); T1 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(S1, mSwitch2), mCoef); T2 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(S0, mSwitch3), mCoef); T3 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(S0, mSwitch4), mCoef); T4 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(S0, mSwitch5), mCoef); T5 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(S0, mSwitch6), mCoef); T0 = _mm256_hadd_epi16(T0, T1); sum1 = _mm256_hadd_epi16(_mm256_hadd_epi16(T2, T3), _mm256_hadd_epi16(T4, T5)); sum2 = _mm256_hadd_epi16(T0, T0); sum1 = _mm256_srai_epi16(_mm256_add_epi16(sum1, mAddOffset), shift); sum2 = _mm256_srai_epi16(_mm256_add_epi16(sum2, mAddOffset), shift); sum2 = _mm256_permutevar8x32_epi32(sum2, index); sum1 = _mm256_packus_epi16(sum1, sum2); _mm256_maskstore_epi32((int*)(dst), mask24, sum1); src += i_src; dst += i_dst; } } /* --------------------------------------------------------------------------- */ void intpl_luma_block_ver_w32_avx2(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff) { int row; const int offset = 32; const int shift = 6; int bsym = (coeff[1] == coeff[6]); const int i_src2 = i_src * 2; const int i_src3 = i_src * 3; const int i_src4 = i_src * 4; const int i_src5 = i_src * 5; const int i_src6 = i_src * 6; const int i_src7 = i_src * 7; __m256i mAddOffset = _mm256_set1_epi16((short)offset); UNUSED_PARAMETER(width); src -= 3 * i_src; if (bsym) { __m256i coeff0 = _mm256_set1_epi8(coeff[0]); __m256i coeff1 = _mm256_set1_epi8(coeff[1]); __m256i coeff2 = _mm256_set1_epi8(coeff[2]); __m256i coeff3 = _mm256_set1_epi8(coeff[3]); __m256i T0, T1, T2, T3, T4, T5, T6, T7, mVal1, mVal2; for (row = 0; row < height; row++) { __m256i S0 = _mm256_loadu_si256((__m256i*)(src)); __m256i S1 = _mm256_loadu_si256((__m256i*)(src + i_src)); __m256i S2 = _mm256_loadu_si256((__m256i*)(src + i_src2)); __m256i S3 = _mm256_loadu_si256((__m256i*)(src + i_src3)); __m256i S4 = _mm256_loadu_si256((__m256i*)(src + i_src4)); __m256i S5 = _mm256_loadu_si256((__m256i*)(src + i_src5)); __m256i S6 = _mm256_loadu_si256((__m256i*)(src + i_src6)); __m256i S7 = _mm256_loadu_si256((__m256i*)(src + i_src7)); T0 = 
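/* The AVX2 vertical kernels keep the SSE pairing schemes but operate on
 * full 32-byte rows: _mm256_unpacklo/hi_epi8 split each row pair into
 * low/high halves, so four maddubs per half filter 32 pixels per row
 * iteration (w32); the w64 variant repeats the pattern on a second strip.
 */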
_mm256_maddubs_epi16(_mm256_unpacklo_epi8(S0, S7), coeff0); T4 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S0, S7), coeff0); T1 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S1, S6), coeff1); T5 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S1, S6), coeff1); T2 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S2, S5), coeff2); T6 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S2, S5), coeff2); T3 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S3, S4), coeff3); T7 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S3, S4), coeff3); mVal1 = _mm256_add_epi16(_mm256_add_epi16(T0, T1), _mm256_add_epi16(T2, T3)); mVal2 = _mm256_add_epi16(_mm256_add_epi16(T4, T5), _mm256_add_epi16(T6, T7)); mVal1 = _mm256_srai_epi16(_mm256_add_epi16(mVal1, mAddOffset), shift); mVal2 = _mm256_srai_epi16(_mm256_add_epi16(mVal2, mAddOffset), shift); mVal1 = _mm256_packus_epi16(mVal1, mVal2); _mm256_storeu_si256((__m256i*)(dst), mVal1); src += i_src; dst += i_dst; } } else { __m256i coeff0 = _mm256_set1_epi16(*(short*)coeff); __m256i coeff1 = _mm256_set1_epi16(*(short*)(coeff + 2)); __m256i coeff2 = _mm256_set1_epi16(*(short*)(coeff + 4)); __m256i coeff3 = _mm256_set1_epi16(*(short*)(coeff + 6)); __m256i T0, T1, T2, T3, T4, T5, T6, T7, mVal1, mVal2; for (row = 0; row < height; row++) { __m256i S0 = _mm256_loadu_si256((__m256i*)(src)); __m256i S1 = _mm256_loadu_si256((__m256i*)(src + i_src)); __m256i S2 = _mm256_loadu_si256((__m256i*)(src + i_src2)); __m256i S3 = _mm256_loadu_si256((__m256i*)(src + i_src3)); __m256i S4 = _mm256_loadu_si256((__m256i*)(src + i_src4)); __m256i S5 = _mm256_loadu_si256((__m256i*)(src + i_src5)); __m256i S6 = _mm256_loadu_si256((__m256i*)(src + i_src6)); __m256i S7 = _mm256_loadu_si256((__m256i*)(src + i_src7)); T0 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S0, S1), coeff0); T4 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S0, S1), coeff0); T1 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S2, S3), coeff1); T5 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S2, S3), coeff1); T2 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S4, S5), coeff2); T6 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S4, S5), coeff2); T3 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S6, S7), coeff3); T7 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S6, S7), coeff3); mVal1 = _mm256_add_epi16(_mm256_add_epi16(T0, T1), _mm256_add_epi16(T2, T3)); mVal2 = _mm256_add_epi16(_mm256_add_epi16(T4, T5), _mm256_add_epi16(T6, T7)); mVal1 = _mm256_srai_epi16(_mm256_add_epi16(mVal1, mAddOffset), shift); mVal2 = _mm256_srai_epi16(_mm256_add_epi16(mVal2, mAddOffset), shift); mVal1 = _mm256_packus_epi16(mVal1, mVal2); _mm256_storeu_si256((__m256i*)(dst), mVal1); src += i_src; dst += i_dst; } } } /* --------------------------------------------------------------------------- */ void intpl_luma_block_ver_w64_avx2(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff) { int row; const int offset = 32; const int shift = 6; int bsym = (coeff[1] == coeff[6]); const int i_src2 = i_src * 2; const int i_src3 = i_src * 3; const int i_src4 = i_src * 4; const int i_src5 = i_src * 5; const int i_src6 = i_src * 6; const int i_src7 = i_src * 7; __m256i mAddOffset = _mm256_set1_epi16((short)offset); UNUSED_PARAMETER(width); src -= 3 * i_src; if (bsym) { __m256i coeff0 = _mm256_set1_epi8(coeff[0]); __m256i coeff1 = _mm256_set1_epi8(coeff[1]); __m256i coeff2 = _mm256_set1_epi8(coeff[2]); __m256i coeff3 = _mm256_set1_epi8(coeff[3]); for (row = 0; row < height; row++) { const pel_t *p = src + 32; __m256i S0 = _mm256_loadu_si256((__m256i*)(src)); __m256i S1 = 
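/* For w64 each row is handled as two independent 32-byte column strips:
 * the first strip starts at src, and p = src + 32 replays the identical
 * 8-tap sequence for pixels 32..63 before advancing to the next row.
 */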
_mm256_loadu_si256((__m256i*)(src + i_src)); __m256i S2 = _mm256_loadu_si256((__m256i*)(src + i_src2)); __m256i S3 = _mm256_loadu_si256((__m256i*)(src + i_src3)); __m256i S4 = _mm256_loadu_si256((__m256i*)(src + i_src4)); __m256i S5 = _mm256_loadu_si256((__m256i*)(src + i_src5)); __m256i S6 = _mm256_loadu_si256((__m256i*)(src + i_src6)); __m256i S7 = _mm256_loadu_si256((__m256i*)(src + i_src7)); __m256i T0, T1, T2, T3, T4, T5, T6, T7, mVal1, mVal2; T0 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S0, S7), coeff0); T1 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S1, S6), coeff1); T2 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S2, S5), coeff2); T3 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S3, S4), coeff3); T4 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S0, S7), coeff0); T5 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S1, S6), coeff1); T6 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S2, S5), coeff2); T7 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S3, S4), coeff3); mVal1 = _mm256_add_epi16(_mm256_add_epi16(T0, T1), _mm256_add_epi16(T2, T3)); mVal2 = _mm256_add_epi16(_mm256_add_epi16(T4, T5), _mm256_add_epi16(T6, T7)); mVal1 = _mm256_srai_epi16(_mm256_add_epi16(mVal1, mAddOffset), shift); mVal2 = _mm256_srai_epi16(_mm256_add_epi16(mVal2, mAddOffset), shift); mVal1 = _mm256_packus_epi16(mVal1, mVal2); _mm256_storeu_si256((__m256i*)(dst), mVal1); S0 = _mm256_loadu_si256((__m256i*)(p)); S1 = _mm256_loadu_si256((__m256i*)(p + i_src)); S2 = _mm256_loadu_si256((__m256i*)(p + i_src2)); S3 = _mm256_loadu_si256((__m256i*)(p + i_src3)); S4 = _mm256_loadu_si256((__m256i*)(p + i_src4)); S5 = _mm256_loadu_si256((__m256i*)(p + i_src5)); S6 = _mm256_loadu_si256((__m256i*)(p + i_src6)); S7 = _mm256_loadu_si256((__m256i*)(p + i_src7)); T0 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S0, S7), coeff0); T1 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S1, S6), coeff1); T2 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S2, S5), coeff2); T3 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S3, S4), coeff3); T4 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S0, S7), coeff0); T5 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S1, S6), coeff1); T6 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S2, S5), coeff2); T7 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S3, S4), coeff3); mVal1 = _mm256_add_epi16(_mm256_add_epi16(T0, T1), _mm256_add_epi16(T2, T3)); mVal2 = _mm256_add_epi16(_mm256_add_epi16(T4, T5), _mm256_add_epi16(T6, T7)); mVal1 = _mm256_srai_epi16(_mm256_add_epi16(mVal1, mAddOffset), shift); mVal2 = _mm256_srai_epi16(_mm256_add_epi16(mVal2, mAddOffset), shift); mVal1 = _mm256_packus_epi16(mVal1, mVal2); _mm256_storeu_si256((__m256i*)(dst + 32), mVal1); src += i_src; dst += i_dst; } } else { __m256i coeff0 = _mm256_set1_epi16(*(short*)coeff); __m256i coeff1 = _mm256_set1_epi16(*(short*)(coeff + 2)); __m256i coeff2 = _mm256_set1_epi16(*(short*)(coeff + 4)); __m256i coeff3 = _mm256_set1_epi16(*(short*)(coeff + 6)); for (row = 0; row < height; row++) { __m256i T0, T1, T2, T3, T4, T5, T6, T7, mVal1, mVal2; const pel_t *p = src + 32; __m256i S0 = _mm256_loadu_si256((__m256i*)(src)); __m256i S1 = _mm256_loadu_si256((__m256i*)(src + i_src)); __m256i S2 = _mm256_loadu_si256((__m256i*)(src + i_src2)); __m256i S3 = _mm256_loadu_si256((__m256i*)(src + i_src3)); __m256i S4 = _mm256_loadu_si256((__m256i*)(src + i_src4)); __m256i S5 = _mm256_loadu_si256((__m256i*)(src + i_src5)); __m256i S6 = _mm256_loadu_si256((__m256i*)(src + i_src6)); __m256i S7 = _mm256_loadu_si256((__m256i*)(src + i_src7)); T0 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S0, S1), 
coeff0); T1 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S2, S3), coeff1); T2 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S4, S5), coeff2); T3 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S6, S7), coeff3); T4 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S0, S1), coeff0); T5 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S2, S3), coeff1); T6 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S4, S5), coeff2); T7 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S6, S7), coeff3); mVal1 = _mm256_add_epi16(_mm256_add_epi16(T0, T1), _mm256_add_epi16(T2, T3)); mVal2 = _mm256_add_epi16(_mm256_add_epi16(T4, T5), _mm256_add_epi16(T6, T7)); mVal1 = _mm256_srai_epi16(_mm256_add_epi16(mVal1, mAddOffset), shift); mVal2 = _mm256_srai_epi16(_mm256_add_epi16(mVal2, mAddOffset), shift); mVal1 = _mm256_packus_epi16(mVal1, mVal2); _mm256_storeu_si256((__m256i*)(dst), mVal1); S0 = _mm256_loadu_si256((__m256i*)(p)); S1 = _mm256_loadu_si256((__m256i*)(p + i_src)); S2 = _mm256_loadu_si256((__m256i*)(p + i_src2)); S3 = _mm256_loadu_si256((__m256i*)(p + i_src3)); S4 = _mm256_loadu_si256((__m256i*)(p + i_src4)); S5 = _mm256_loadu_si256((__m256i*)(p + i_src5)); S6 = _mm256_loadu_si256((__m256i*)(p + i_src6)); S7 = _mm256_loadu_si256((__m256i*)(p + i_src7)); T0 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S0, S1), coeff0); T1 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S2, S3), coeff1); T2 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S4, S5), coeff2); T3 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S6, S7), coeff3); T4 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S0, S1), coeff0); T5 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S2, S3), coeff1); T6 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S4, S5), coeff2); T7 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S6, S7), coeff3); mVal1 = _mm256_add_epi16(_mm256_add_epi16(T0, T1), _mm256_add_epi16(T2, T3)); mVal2 = _mm256_add_epi16(_mm256_add_epi16(T4, T5), _mm256_add_epi16(T6, T7)); mVal1 = _mm256_srai_epi16(_mm256_add_epi16(mVal1, mAddOffset), shift); mVal2 = _mm256_srai_epi16(_mm256_add_epi16(mVal2, mAddOffset), shift); mVal1 = _mm256_packus_epi16(mVal1, mVal2); _mm256_storeu_si256((__m256i*)(dst + 32), mVal1); src += i_src; dst += i_dst; } } } /* --------------------------------------------------------------------------- */ void intpl_luma_block_ver_w16_avx2(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff) { int row; const int offset = 32; const int shift = 6; int bsym = (coeff[1] == coeff[6]); const int i_src2 = i_src * 2; const int i_src3 = i_src * 3; const int i_src4 = i_src * 4; const int i_src5 = i_src * 5; const int i_src6 = i_src * 6; const int i_src7 = i_src * 7; const int i_src8 = i_src * 8; __m256i mAddOffset = _mm256_set1_epi16((short)offset); src -= 3 * i_src; UNUSED_PARAMETER(width); if (bsym) { __m256i coeff0 = _mm256_set1_epi8(coeff[0]); __m256i coeff1 = _mm256_set1_epi8(coeff[1]); __m256i coeff2 = _mm256_set1_epi8(coeff[2]); __m256i coeff3 = _mm256_set1_epi8(coeff[3]); for (row = 0; row < height; row += 2) { __m128i S0 = _mm_loadu_si128((__m128i*)(src)); __m128i S1 = _mm_loadu_si128((__m128i*)(src + i_src)); __m128i S2 = _mm_loadu_si128((__m128i*)(src + i_src2)); __m128i S3 = _mm_loadu_si128((__m128i*)(src + i_src3)); __m128i S4 = _mm_loadu_si128((__m128i*)(src + i_src4)); __m128i S5 = _mm_loadu_si128((__m128i*)(src + i_src5)); __m128i S6 = _mm_loadu_si128((__m128i*)(src + i_src6)); __m128i S7 = _mm_loadu_si128((__m128i*)(src + i_src7)); __m128i S8 = _mm_loadu_si128((__m128i*)(src + i_src8)); __m256i T0, T1, T2, T3, T4, T5, T6, T7, mVal1, 
mVal2; __m256i R0, R1, R2, R3, R4, R5, R6, R7; R0 = _mm256_set_m128i(S0, S1); R1 = _mm256_set_m128i(S1, S2); R2 = _mm256_set_m128i(S2, S3); R3 = _mm256_set_m128i(S3, S4); R4 = _mm256_set_m128i(S4, S5); R5 = _mm256_set_m128i(S5, S6); R6 = _mm256_set_m128i(S6, S7); R7 = _mm256_set_m128i(S7, S8); T0 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(R0, R7), coeff0); T1 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(R0, R7), coeff0); T2 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(R1, R6), coeff1); T3 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(R1, R6), coeff1); T4 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(R2, R5), coeff2); T5 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(R2, R5), coeff2); T6 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(R3, R4), coeff3); T7 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(R3, R4), coeff3); mVal1 = _mm256_add_epi16(_mm256_add_epi16(T0, T2), _mm256_add_epi16(T4, T6)); mVal2 = _mm256_add_epi16(_mm256_add_epi16(T1, T3), _mm256_add_epi16(T5, T7)); mVal1 = _mm256_srai_epi16(_mm256_add_epi16(mVal1, mAddOffset), shift); mVal2 = _mm256_srai_epi16(_mm256_add_epi16(mVal2, mAddOffset), shift); mVal1 = _mm256_packus_epi16(mVal1, mVal2); _mm256_storeu2_m128i((__m128i*)dst, (__m128i*)(dst + i_dst), mVal1); src += 2 * i_src; dst += 2 * i_dst; } } else { __m256i coeff0 = _mm256_set1_epi16(*(int16_t*)(coeff + 0)); __m256i coeff1 = _mm256_set1_epi16(*(int16_t*)(coeff + 2)); __m256i coeff2 = _mm256_set1_epi16(*(int16_t*)(coeff + 4)); __m256i coeff3 = _mm256_set1_epi16(*(int16_t*)(coeff + 6)); for (row = 0; row < height; row += 2) { __m128i S0 = _mm_loadu_si128((__m128i*)(src)); __m128i S1 = _mm_loadu_si128((__m128i*)(src + i_src)); __m128i S2 = _mm_loadu_si128((__m128i*)(src + i_src2)); __m128i S3 = _mm_loadu_si128((__m128i*)(src + i_src3)); __m128i S4 = _mm_loadu_si128((__m128i*)(src + i_src4)); __m128i S5 = _mm_loadu_si128((__m128i*)(src + i_src5)); __m128i S6 = _mm_loadu_si128((__m128i*)(src + i_src6)); __m128i S7 = _mm_loadu_si128((__m128i*)(src + i_src7)); __m128i S8 = _mm_loadu_si128((__m128i*)(src + i_src8)); __m256i T0, T1, T2, T3, T4, T5, T6, T7, mVal1, mVal2; __m256i R0, R1, R2, R3, R4, R5, R6, R7; R0 = _mm256_set_m128i(S0, S1); R1 = _mm256_set_m128i(S1, S2); R2 = _mm256_set_m128i(S2, S3); R3 = _mm256_set_m128i(S3, S4); R4 = _mm256_set_m128i(S4, S5); R5 = _mm256_set_m128i(S5, S6); R6 = _mm256_set_m128i(S6, S7); R7 = _mm256_set_m128i(S7, S8); T0 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(R0, R1), coeff0); T1 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(R0, R1), coeff0); T2 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(R2, R3), coeff1); T3 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(R2, R3), coeff1); T4 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(R4, R5), coeff2); T5 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(R4, R5), coeff2); T6 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(R6, R7), coeff3); T7 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(R6, R7), coeff3); mVal1 = _mm256_add_epi16(_mm256_add_epi16(T0, T2), _mm256_add_epi16(T4, T6)); mVal2 = _mm256_add_epi16(_mm256_add_epi16(T1, T3), _mm256_add_epi16(T5, T7)); mVal1 = _mm256_srai_epi16(_mm256_add_epi16(mVal1, mAddOffset), shift); mVal2 = _mm256_srai_epi16(_mm256_add_epi16(mVal2, mAddOffset), shift); mVal1 = _mm256_packus_epi16(mVal1, mVal2); _mm256_storeu2_m128i((__m128i*)dst, (__m128i*)(dst + i_dst), mVal1); src += 2 * i_src; dst += 2 * i_dst; } } } /* --------------------------------------------------------------------------- */ void intpl_luma_block_ver_w24_avx2(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const 
int8_t *coeff) { int row; const int offset = 32; const int shift = 6; int bsym = (coeff[1] == coeff[6]); __m256i mask24 = _mm256_setr_epi16(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0); const int i_src2 = i_src * 2; const int i_src3 = i_src * 3; const int i_src4 = i_src * 4; const int i_src5 = i_src * 5; const int i_src6 = i_src * 6; const int i_src7 = i_src * 7; __m256i mAddOffset = _mm256_set1_epi16((short)offset); UNUSED_PARAMETER(width); src -= 3 * i_src; if (bsym) { __m256i coeff0 = _mm256_set1_epi8(coeff[0]); __m256i coeff1 = _mm256_set1_epi8(coeff[1]); __m256i coeff2 = _mm256_set1_epi8(coeff[2]); __m256i coeff3 = _mm256_set1_epi8(coeff[3]); __m256i T0, T1, T2, T3, T4, T5, T6, T7, mVal1, mVal2; for (row = 0; row < height; row++) { __m256i S0 = _mm256_loadu_si256((__m256i*)(src)); __m256i S1 = _mm256_loadu_si256((__m256i*)(src + i_src)); __m256i S2 = _mm256_loadu_si256((__m256i*)(src + i_src2)); __m256i S3 = _mm256_loadu_si256((__m256i*)(src + i_src3)); __m256i S4 = _mm256_loadu_si256((__m256i*)(src + i_src4)); __m256i S5 = _mm256_loadu_si256((__m256i*)(src + i_src5)); __m256i S6 = _mm256_loadu_si256((__m256i*)(src + i_src6)); __m256i S7 = _mm256_loadu_si256((__m256i*)(src + i_src7)); T0 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S0, S7), coeff0); T1 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S1, S6), coeff1); T2 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S2, S5), coeff2); T3 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S3, S4), coeff3); T4 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S0, S7), coeff0); T5 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S1, S6), coeff1); T6 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S2, S5), coeff2); T7 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S3, S4), coeff3); mVal1 = _mm256_add_epi16(_mm256_add_epi16(T0, T1), _mm256_add_epi16(T2, T3)); mVal2 = _mm256_add_epi16(_mm256_add_epi16(T4, T5), _mm256_add_epi16(T6, T7)); mVal1 = _mm256_srai_epi16(_mm256_add_epi16(mVal1, mAddOffset), shift); mVal2 = _mm256_srai_epi16(_mm256_add_epi16(mVal2, mAddOffset), shift); mVal1 = _mm256_packus_epi16(mVal1, mVal2); _mm256_maskstore_epi32((int*)(dst), mask24, mVal1); src += i_src; dst += i_dst; } } else { __m256i coeff0 = _mm256_set1_epi16(*(short*)coeff); __m256i coeff1 = _mm256_set1_epi16(*(short*)(coeff + 2)); __m256i coeff2 = _mm256_set1_epi16(*(short*)(coeff + 4)); __m256i coeff3 = _mm256_set1_epi16(*(short*)(coeff + 6)); __m256i T0, T1, T2, T3, T4, T5, T6, T7, mVal1, mVal2; for (row = 0; row < height; row++) { __m256i S0 = _mm256_loadu_si256((__m256i*)(src)); __m256i S1 = _mm256_loadu_si256((__m256i*)(src + i_src)); __m256i S2 = _mm256_loadu_si256((__m256i*)(src + i_src2)); __m256i S3 = _mm256_loadu_si256((__m256i*)(src + i_src3)); __m256i S4 = _mm256_loadu_si256((__m256i*)(src + i_src4)); __m256i S5 = _mm256_loadu_si256((__m256i*)(src + i_src5)); __m256i S6 = _mm256_loadu_si256((__m256i*)(src + i_src6)); __m256i S7 = _mm256_loadu_si256((__m256i*)(src + i_src7)); T0 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S0, S1), coeff0); T1 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S2, S3), coeff1); T2 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S4, S5), coeff2); T3 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S6, S7), coeff3); T4 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S0, S1), coeff0); T5 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S2, S3), coeff1); T6 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S4, S5), coeff2); T7 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S6, S7), coeff3); mVal1 = _mm256_add_epi16(_mm256_add_epi16(T0, T1), _mm256_add_epi16(T2, T3)); 
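/* mVal1/mVal2 gather the four tap-pair sums for the low/high byte halves of each lane; rounding (offset = 32), the shift by 6 and the pack back to 8-bit pixels follow */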
mVal2 = _mm256_add_epi16(_mm256_add_epi16(T4, T5), _mm256_add_epi16(T6, T7)); mVal1 = _mm256_srai_epi16(_mm256_add_epi16(mVal1, mAddOffset), shift); mVal2 = _mm256_srai_epi16(_mm256_add_epi16(mVal2, mAddOffset), shift); mVal1 = _mm256_packus_epi16(mVal1, mVal2); _mm256_maskstore_epi32((int*)(dst), mask24, mVal1); src += i_src; dst += i_dst; } } } /* --------------------------------------------------------------------------- */ void intpl_luma_block_ver_w48_avx2(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff) { const int shift = 6; const int offset = (1 << shift) >> 1; const int i_src2 = i_src * 2; const int i_src3 = i_src * 3; const int i_src4 = i_src * 4; const int i_src5 = i_src * 5; const int i_src6 = i_src * 6; const int i_src7 = i_src * 7; const __m256i mask16 = _mm256_setr_epi32(-1, -1, -1, -1, 0, 0, 0, 0); int bsym = (coeff[1] == coeff[6]); int row; src -= 3 * i_src; UNUSED_PARAMETER(width); if (bsym) { __m256i mAddOffset = _mm256_set1_epi16((short)offset); __m256i coeff0 = _mm256_set1_epi8(coeff[0]); __m256i coeff1 = _mm256_set1_epi8(coeff[1]); __m256i coeff2 = _mm256_set1_epi8(coeff[2]); __m256i coeff3 = _mm256_set1_epi8(coeff[3]); __m256i T0, T1, T2, T3, T4, T5, T6, T7, mVal1, mVal2; for (row = 0; row < height; row++) { const pel_t *p = src + 32; __m256i S0 = _mm256_loadu_si256((__m256i*)(src)); __m256i S1 = _mm256_loadu_si256((__m256i*)(src + i_src)); __m256i S2 = _mm256_loadu_si256((__m256i*)(src + i_src2)); __m256i S3 = _mm256_loadu_si256((__m256i*)(src + i_src3)); __m256i S4 = _mm256_loadu_si256((__m256i*)(src + i_src4)); __m256i S5 = _mm256_loadu_si256((__m256i*)(src + i_src5)); __m256i S6 = _mm256_loadu_si256((__m256i*)(src + i_src6)); __m256i S7 = _mm256_loadu_si256((__m256i*)(src + i_src7)); T0 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S0, S7), coeff0); T1 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S1, S6), coeff1); T2 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S2, S5), coeff2); T3 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S3, S4), coeff3); T4 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S0, S7), coeff0); T5 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S1, S6), coeff1); T6 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S2, S5), coeff2); T7 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S3, S4), coeff3); mVal1 = _mm256_add_epi16(_mm256_add_epi16(T0, T1), _mm256_add_epi16(T2, T3)); mVal2 = _mm256_add_epi16(_mm256_add_epi16(T4, T5), _mm256_add_epi16(T6, T7)); mVal1 = _mm256_srai_epi16(_mm256_add_epi16(mVal1, mAddOffset), shift); mVal2 = _mm256_srai_epi16(_mm256_add_epi16(mVal2, mAddOffset), shift); mVal1 = _mm256_packus_epi16(mVal1, mVal2); _mm256_storeu_si256((__m256i*)(dst), mVal1); S0 = _mm256_loadu_si256((__m256i*)(p)); S1 = _mm256_loadu_si256((__m256i*)(p + i_src)); S2 = _mm256_loadu_si256((__m256i*)(p + i_src2)); S3 = _mm256_loadu_si256((__m256i*)(p + i_src3)); S4 = _mm256_loadu_si256((__m256i*)(p + i_src4)); S5 = _mm256_loadu_si256((__m256i*)(p + i_src5)); S6 = _mm256_loadu_si256((__m256i*)(p + i_src6)); S7 = _mm256_loadu_si256((__m256i*)(p + i_src7)); T0 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S0, S7), coeff0); T1 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S1, S6), coeff1); T2 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S2, S5), coeff2); T3 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S3, S4), coeff3); T4 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S0, S7), coeff0); T5 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S1, S6), coeff1); T6 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S2, S5), coeff2); T7 = 
_mm256_maddubs_epi16(_mm256_unpackhi_epi8(S3, S4), coeff3); mVal1 = _mm256_add_epi16(_mm256_add_epi16(T0, T1), _mm256_add_epi16(T2, T3)); mVal2 = _mm256_add_epi16(_mm256_add_epi16(T4, T5), _mm256_add_epi16(T6, T7)); mVal1 = _mm256_srai_epi16(_mm256_add_epi16(mVal1, mAddOffset), shift); mVal2 = _mm256_srai_epi16(_mm256_add_epi16(mVal2, mAddOffset), shift); mVal1 = _mm256_packus_epi16(mVal1, mVal2); _mm256_maskstore_epi32((int*)(dst + 32), mask16, mVal1); src += i_src; dst += i_dst; } } else { __m256i mAddOffset = _mm256_set1_epi16((short)offset); __m256i coeff0 = _mm256_set1_epi16(*(short*)coeff); __m256i coeff1 = _mm256_set1_epi16(*(short*)(coeff + 2)); __m256i coeff2 = _mm256_set1_epi16(*(short*)(coeff + 4)); __m256i coeff3 = _mm256_set1_epi16(*(short*)(coeff + 6)); __m256i T0, T1, T2, T3, T4, T5, T6, T7, mVal1, mVal2; for (row = 0; row < height; row++) { const pel_t *p = src + 32; __m256i S0 = _mm256_loadu_si256((__m256i*)(src)); __m256i S1 = _mm256_loadu_si256((__m256i*)(src + i_src)); __m256i S2 = _mm256_loadu_si256((__m256i*)(src + i_src2)); __m256i S3 = _mm256_loadu_si256((__m256i*)(src + i_src3)); __m256i S4 = _mm256_loadu_si256((__m256i*)(src + i_src4)); __m256i S5 = _mm256_loadu_si256((__m256i*)(src + i_src5)); __m256i S6 = _mm256_loadu_si256((__m256i*)(src + i_src6)); __m256i S7 = _mm256_loadu_si256((__m256i*)(src + i_src7)); T0 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S0, S1), coeff0); T1 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S2, S3), coeff1); T2 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S4, S5), coeff2); T3 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S6, S7), coeff3); T4 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S0, S1), coeff0); T5 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S2, S3), coeff1); T6 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S4, S5), coeff2); T7 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S6, S7), coeff3); mVal1 = _mm256_add_epi16(_mm256_add_epi16(T0, T1), _mm256_add_epi16(T2, T3)); mVal2 = _mm256_add_epi16(_mm256_add_epi16(T4, T5), _mm256_add_epi16(T6, T7)); mVal1 = _mm256_srai_epi16(_mm256_add_epi16(mVal1, mAddOffset), shift); mVal2 = _mm256_srai_epi16(_mm256_add_epi16(mVal2, mAddOffset), shift); mVal1 = _mm256_packus_epi16(mVal1, mVal2); _mm256_storeu_si256((__m256i*)(dst), mVal1); S0 = _mm256_loadu_si256((__m256i*)(p)); S1 = _mm256_loadu_si256((__m256i*)(p + i_src)); S2 = _mm256_loadu_si256((__m256i*)(p + i_src2)); S3 = _mm256_loadu_si256((__m256i*)(p + i_src3)); S4 = _mm256_loadu_si256((__m256i*)(p + i_src4)); S5 = _mm256_loadu_si256((__m256i*)(p + i_src5)); S6 = _mm256_loadu_si256((__m256i*)(p + i_src6)); S7 = _mm256_loadu_si256((__m256i*)(p + i_src7)); T0 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S0, S1), coeff0); T1 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S2, S3), coeff1); T2 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S4, S5), coeff2); T3 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S6, S7), coeff3); T4 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S0, S1), coeff0); T5 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S2, S3), coeff1); T6 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S4, S5), coeff2); T7 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S6, S7), coeff3); mVal1 = _mm256_add_epi16(_mm256_add_epi16(T0, T1), _mm256_add_epi16(T2, T3)); mVal2 = _mm256_add_epi16(_mm256_add_epi16(T4, T5), _mm256_add_epi16(T6, T7)); mVal1 = _mm256_srai_epi16(_mm256_add_epi16(mVal1, mAddOffset), shift); mVal2 = _mm256_srai_epi16(_mm256_add_epi16(mVal2, mAddOffset), shift); mVal1 = _mm256_packus_epi16(mVal1, mVal2); _mm256_maskstore_epi32((int*)(dst + 32), 
mask16, mVal1);
            src += i_src;
            dst += i_dst;
        }
    }
}

/* --------------------------------------------------------------------------- */
void intpl_luma_block_ext_w16_avx2(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coef_x, const int8_t *coef_y)
{
    ALIGN32(int16_t tmp_res[(64 + 7) * 64]);
    int16_t *tmp = tmp_res;
    const int i_tmp  = 64;
    const int i_tmp2 = 2 * i_tmp;
    const int i_tmp3 = 3 * i_tmp;
    const int i_tmp4 = 4 * i_tmp;
    const int i_tmp5 = 5 * i_tmp;
    const int i_tmp6 = 6 * i_tmp;
    const int i_tmp7 = 7 * i_tmp;
    const int shift = 12;
    const __m256i mAddOffset = _mm256_set1_epi32((1 << shift) >> 1);
    int row, col;
    __m256i mSwitch1 = _mm256_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8,   /* first 8 pixels */
                                        0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8);  /* last 8 pixels */
    __m256i mSwitch2 = _mm256_setr_epi8(2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9, 10,
                                        2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9, 10);
    __m256i mSwitch3 = _mm256_setr_epi8(4, 5, 6, 7, 8, 9, 10, 11, 5, 6, 7, 8, 9, 10, 11, 12,
                                        4, 5, 6, 7, 8, 9, 10, 11, 5, 6, 7, 8, 9, 10, 11, 12);
    __m256i mSwitch4 = _mm256_setr_epi8(6, 7, 8, 9, 10, 11, 12, 13, 7, 8, 9, 10, 11, 12, 13, 14,
                                        6, 7, 8, 9, 10, 11, 12, 13, 7, 8, 9, 10, 11, 12, 13, 14);
    __m256i mCoef;

    src = src - 3 * i_src - 3;

    // HOR
#if ARCH_X86_64
    mCoef = _mm256_set1_epi64x(*(long long*)coef_x);
#else
    mCoef = _mm256_loadu_si256((__m256i*)coef_x);
    mCoef = _mm256_permute4x64_epi64(mCoef, 0x0);
#endif
    for (row = -3; row < height + 4; row++) {
        for (col = 0; col < width; col += 16) {
            __m256i T0, T1, sum, T2, T3;
            __m256i S = _mm256_loadu_si256((__m256i*)(src + col));
            // put the data of the first 8 and the last 8 output pixels into the low and high 128-bit lanes, respectively
            __m256i S0 = _mm256_permute4x64_epi64(S, 0x94);

            T0 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(S0, mSwitch1), mCoef);
            T1 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(S0, mSwitch2), mCoef);
            T2 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(S0, mSwitch3), mCoef);
            T3 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(S0, mSwitch4), mCoef);
            sum = _mm256_hadd_epi16(_mm256_hadd_epi16(T0, T1), _mm256_hadd_epi16(T2, T3));
            _mm256_store_si256((__m256i*)(tmp + col), sum);
        }
        src += i_src;
        tmp += i_tmp;
    }

    // VER
    tmp = tmp_res;
    __m256i mCoefy1 = _mm256_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)coef_y));
    __m256i mCoefy2 = _mm256_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)(coef_y + 2)));
    __m256i mCoefy3 = _mm256_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)(coef_y + 4)));
    __m256i mCoefy4 = _mm256_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)(coef_y + 6)));

    // interpolating 2 or 4 rows at a time would avoid the repeated loads below
    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col += 16) {
            __m256i T0, T1, T2, T3, T4, T5, T6, T7;
            __m256i mVal1, mVal2;
            __m256i S0 = _mm256_load_si256((__m256i*)(tmp + col));
            __m256i S1 = _mm256_load_si256((__m256i*)(tmp + col + i_tmp));
            __m256i S2 = _mm256_load_si256((__m256i*)(tmp + col + i_tmp2));
            __m256i S3 = _mm256_load_si256((__m256i*)(tmp + col + i_tmp3));
            __m256i S4 = _mm256_load_si256((__m256i*)(tmp + col + i_tmp4));
            __m256i S5 = _mm256_load_si256((__m256i*)(tmp + col + i_tmp5));
            __m256i S6 = _mm256_load_si256((__m256i*)(tmp + col + i_tmp6));
            __m256i S7 = _mm256_load_si256((__m256i*)(tmp + col + i_tmp7));

            T0 = _mm256_madd_epi16(_mm256_unpacklo_epi16(S0, S1), mCoefy1);
            T1 = _mm256_madd_epi16(_mm256_unpacklo_epi16(S2, S3), mCoefy2);
            T2 = _mm256_madd_epi16(_mm256_unpacklo_epi16(S4, S5), mCoefy3);
            T3 = _mm256_madd_epi16(_mm256_unpacklo_epi16(S6, S7), mCoefy4);
            T4 = _mm256_madd_epi16(_mm256_unpackhi_epi16(S0, S1), mCoefy1);
            T5 = _mm256_madd_epi16(_mm256_unpackhi_epi16(S2, S3), mCoefy2);
            T6 = _mm256_madd_epi16(_mm256_unpackhi_epi16(S4, S5), mCoefy3);
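            /* T7 below completes the 8-tap vertical accumulation in 32-bit precision; the low/high halves are then combined, rounded with mAddOffset, shifted by 12 and packed back to 8-bit pixels */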
T7 = _mm256_madd_epi16(_mm256_unpackhi_epi16(S6, S7), mCoefy4); mVal1 = _mm256_add_epi32(_mm256_add_epi32(T0, T1), _mm256_add_epi32(T2, T3)); mVal2 = _mm256_add_epi32(_mm256_add_epi32(T4, T5), _mm256_add_epi32(T6, T7)); mVal1 = _mm256_srai_epi32(_mm256_add_epi32(mVal1, mAddOffset), shift); mVal2 = _mm256_srai_epi32(_mm256_add_epi32(mVal2, mAddOffset), shift); mVal1 = _mm256_packs_epi32(mVal1, mVal2); mVal1 = _mm256_packus_epi16(mVal1, mVal1); mVal1 = _mm256_permute4x64_epi64(mVal1, 0xd8); _mm_storeu_si128((__m128i*)(dst + col), _mm256_castsi256_si128(mVal1)); } tmp += i_tmp; dst += i_dst; } } /* --------------------------------------------------------------------------- */ void intpl_luma_block_ext_w24_avx2(pel_t *dst, int i_dst, pel_t *src, int i_src, int height, const int8_t *coef_x, const int8_t *coef_y) { ALIGN32(int16_t tmp_res[(64 + 7) * 64]); int16_t *tmp = tmp_res; const int i_tmp = 32; const int i_tmp2 = 2 * i_tmp; const int i_tmp3 = 3 * i_tmp; const int i_tmp4 = 4 * i_tmp; const int i_tmp5 = 5 * i_tmp; const int i_tmp6 = 6 * i_tmp; const int i_tmp7 = 7 * i_tmp; int row; int bsymy = (coef_y[1] == coef_y[6]); int shift = 12; __m256i mAddOffset = _mm256_set1_epi32(1 << 11); __m256i mCoef; __m256i mask24 = _mm256_setr_epi16(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0); // HOR __m256i mSwitch1 = _mm256_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8, 4, 5, 6, 7, 8, 9, 10, 11, 5, 6, 7, 8, 9, 10, 11, 12); __m256i mSwitch2 = _mm256_setr_epi8(2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9, 10, 6, 7, 8, 9, 10, 11, 12, 13, 7, 8, 9, 10, 11, 12, 13, 14); __m256i mSwitch3 = _mm256_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8); __m256i mSwitch4 = _mm256_setr_epi8(2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9, 10, 2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9, 10); __m256i mSwitch5 = _mm256_setr_epi8(4, 5, 6, 7, 8, 9, 10, 11, 5, 6, 7, 8, 9, 10, 11, 12, 4, 5, 6, 7, 8, 9, 10, 11, 5, 6, 7, 8, 9, 10, 11, 12); __m256i mSwitch6 = _mm256_setr_epi8(6, 7, 8, 9, 10, 11, 12, 13, 7, 8, 9, 10, 11, 12, 13, 14, 6, 7, 8, 9, 10, 11, 12, 13, 7, 8, 9, 10, 11, 12, 13, 14); src -= (3 * i_src + 3); #if ARCH_X86_64 mCoef = _mm256_set1_epi64x(*(long long*)coef_x); #else mCoef = _mm256_loadu_si256((__m256i*)coef_x); mCoef = _mm256_permute4x64_epi64(mCoef, 0x0); #endif for (row = -3; row < height + 4; row++) { __m256i T0, T1, T2, T3, T4, T5, sum1, sum2; __m256i S0 = _mm256_loadu_si256((__m256i*)(src)); __m256i S1 = _mm256_permute4x64_epi64(S0, 0x99); T0 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(S1, mSwitch1), mCoef); T1 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(S1, mSwitch2), mCoef); T0 = _mm256_hadd_epi16(T0, T1); T2 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(S0, mSwitch3), mCoef); T3 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(S0, mSwitch4), mCoef); T4 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(S0, mSwitch5), mCoef); T5 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(S0, mSwitch6), mCoef); sum1 = _mm256_hadd_epi16(_mm256_hadd_epi16(T2, T3), _mm256_hadd_epi16(T4, T5)); sum2 = _mm256_hadd_epi16(T0, T0); sum2 = _mm256_permute4x64_epi64(sum2, 0xd8); sum2 = _mm256_permute2x128_si256(sum1, sum2, 0x13); _mm_storeu_si128((__m128i*)(tmp), _mm256_castsi256_si128(sum1)); _mm256_storeu_si256((__m256i*)(tmp + 8), sum2); src += i_src; tmp += i_tmp; } // VER tmp = tmp_res; if (bsymy) { __m256i mCoefy1 = _mm256_set1_epi16(coef_y[0]); __m256i mCoefy2 = _mm256_set1_epi16(coef_y[1]); __m256i mCoefy3 = _mm256_set1_epi16(coef_y[2]); __m256i mCoefy4 = 
_mm256_set1_epi16(coef_y[3]); for (row = 0; row < height; row++) { __m256i mVal1, mVal2, mVal, mVal3, mVal4; __m256i T0, T1, T2, T3, S0, S1, S2, S3; __m256i T4, T5, T6, T7, S4, S5, S6, S7; __m256i T00, T11, T22, T33, S00, S11, S22, S33; __m256i T44, T55, T66, T77, S44, S55, S66, S77; S0 = _mm256_loadu_si256((__m256i*)(tmp)); S1 = _mm256_loadu_si256((__m256i*)(tmp + i_tmp)); S2 = _mm256_loadu_si256((__m256i*)(tmp + i_tmp2)); S3 = _mm256_loadu_si256((__m256i*)(tmp + i_tmp3)); S4 = _mm256_loadu_si256((__m256i*)(tmp + i_tmp4)); S5 = _mm256_loadu_si256((__m256i*)(tmp + i_tmp5)); S6 = _mm256_loadu_si256((__m256i*)(tmp + i_tmp6)); S7 = _mm256_loadu_si256((__m256i*)(tmp + i_tmp7)); S00 = _mm256_loadu_si256((__m256i*)(tmp + 16)); S11 = _mm256_loadu_si256((__m256i*)(tmp + 16 + i_tmp)); S22 = _mm256_loadu_si256((__m256i*)(tmp + 16 + i_tmp2)); S33 = _mm256_loadu_si256((__m256i*)(tmp + 16 + i_tmp3)); S44 = _mm256_loadu_si256((__m256i*)(tmp + 16 + i_tmp4)); S55 = _mm256_loadu_si256((__m256i*)(tmp + 16 + i_tmp5)); S66 = _mm256_loadu_si256((__m256i*)(tmp + 16 + i_tmp6)); S77 = _mm256_loadu_si256((__m256i*)(tmp + 16 + i_tmp7)); T0 = _mm256_madd_epi16(_mm256_unpacklo_epi16(S0, S7), mCoefy1); T1 = _mm256_madd_epi16(_mm256_unpacklo_epi16(S1, S6), mCoefy2); T2 = _mm256_madd_epi16(_mm256_unpacklo_epi16(S2, S5), mCoefy3); T3 = _mm256_madd_epi16(_mm256_unpacklo_epi16(S3, S4), mCoefy4); T4 = _mm256_madd_epi16(_mm256_unpackhi_epi16(S0, S7), mCoefy1); T5 = _mm256_madd_epi16(_mm256_unpackhi_epi16(S1, S6), mCoefy2); T6 = _mm256_madd_epi16(_mm256_unpackhi_epi16(S2, S5), mCoefy3); T7 = _mm256_madd_epi16(_mm256_unpackhi_epi16(S3, S4), mCoefy4); T00 = _mm256_madd_epi16(_mm256_unpacklo_epi16(S00, S77), mCoefy1); T11 = _mm256_madd_epi16(_mm256_unpacklo_epi16(S11, S66), mCoefy2); T22 = _mm256_madd_epi16(_mm256_unpacklo_epi16(S22, S55), mCoefy3); T33 = _mm256_madd_epi16(_mm256_unpacklo_epi16(S33, S44), mCoefy4); T44 = _mm256_madd_epi16(_mm256_unpackhi_epi16(S00, S77), mCoefy1); T55 = _mm256_madd_epi16(_mm256_unpackhi_epi16(S11, S66), mCoefy2); T66 = _mm256_madd_epi16(_mm256_unpackhi_epi16(S22, S55), mCoefy3); T77 = _mm256_madd_epi16(_mm256_unpackhi_epi16(S33, S44), mCoefy4); mVal1 = _mm256_add_epi32(_mm256_add_epi32(T0, T1), _mm256_add_epi32(T2, T3)); mVal2 = _mm256_add_epi32(_mm256_add_epi32(T4, T5), _mm256_add_epi32(T6, T7)); mVal3 = _mm256_add_epi32(_mm256_add_epi32(T00, T11), _mm256_add_epi32(T22, T33)); mVal4 = _mm256_add_epi32(_mm256_add_epi32(T44, T55), _mm256_add_epi32(T66, T77)); mVal1 = _mm256_srai_epi32(_mm256_add_epi32(mVal1, mAddOffset), shift); mVal2 = _mm256_srai_epi32(_mm256_add_epi32(mVal2, mAddOffset), shift); mVal3 = _mm256_srai_epi32(_mm256_add_epi32(mVal3, mAddOffset), shift); mVal4 = _mm256_srai_epi32(_mm256_add_epi32(mVal4, mAddOffset), shift); mVal = _mm256_packus_epi16(_mm256_packs_epi32(mVal1, mVal2), _mm256_packs_epi32(mVal3, mVal4)); mVal = _mm256_permute4x64_epi64(mVal, 0xd8); _mm256_maskstore_epi32((int*)(dst), mask24, mVal); tmp += i_tmp; dst += i_dst; } } else { __m256i mCoefy1 = _mm256_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)(coef_y))); __m256i mCoefy2 = _mm256_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)(coef_y + 2))); __m256i mCoefy3 = _mm256_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)(coef_y + 4))); __m256i mCoefy4 = _mm256_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)(coef_y + 6))); for (row = 0; row < height; row++) { __m256i mVal1, mVal2, mVal, mVal3, mVal4; __m256i T0, T1, T2, T3, S0, S1, S2, S3; __m256i T4, T5, T6, T7, S4, S5, S6, S7; __m256i T00, T11, T22, T33, S00, S11, S22, S33; __m256i T44, 
T55, T66, T77, S44, S55, S66, S77; S0 = _mm256_loadu_si256((__m256i*)(tmp)); S1 = _mm256_loadu_si256((__m256i*)(tmp + i_tmp)); S2 = _mm256_loadu_si256((__m256i*)(tmp + i_tmp2)); S3 = _mm256_loadu_si256((__m256i*)(tmp + i_tmp3)); S4 = _mm256_loadu_si256((__m256i*)(tmp + i_tmp4)); S5 = _mm256_loadu_si256((__m256i*)(tmp + i_tmp5)); S6 = _mm256_loadu_si256((__m256i*)(tmp + i_tmp6)); S7 = _mm256_loadu_si256((__m256i*)(tmp + i_tmp7)); S00 = _mm256_loadu_si256((__m256i*)(tmp + 16)); S11 = _mm256_loadu_si256((__m256i*)(tmp + 16 + i_tmp)); S22 = _mm256_loadu_si256((__m256i*)(tmp + 16 + i_tmp2)); S33 = _mm256_loadu_si256((__m256i*)(tmp + 16 + i_tmp3)); S44 = _mm256_loadu_si256((__m256i*)(tmp + 16 + i_tmp4)); S55 = _mm256_loadu_si256((__m256i*)(tmp + 16 + i_tmp5)); S66 = _mm256_loadu_si256((__m256i*)(tmp + 16 + i_tmp6)); S77 = _mm256_loadu_si256((__m256i*)(tmp + 16 + i_tmp7)); T0 = _mm256_madd_epi16(_mm256_unpacklo_epi16(S0, S1), mCoefy1); T1 = _mm256_madd_epi16(_mm256_unpacklo_epi16(S2, S3), mCoefy2); T2 = _mm256_madd_epi16(_mm256_unpacklo_epi16(S4, S5), mCoefy3); T3 = _mm256_madd_epi16(_mm256_unpacklo_epi16(S6, S7), mCoefy4); T4 = _mm256_madd_epi16(_mm256_unpackhi_epi16(S0, S1), mCoefy1); T5 = _mm256_madd_epi16(_mm256_unpackhi_epi16(S2, S3), mCoefy2); T6 = _mm256_madd_epi16(_mm256_unpackhi_epi16(S4, S5), mCoefy3); T7 = _mm256_madd_epi16(_mm256_unpackhi_epi16(S6, S7), mCoefy4); T00 = _mm256_madd_epi16(_mm256_unpacklo_epi16(S00, S11), mCoefy1); T11 = _mm256_madd_epi16(_mm256_unpacklo_epi16(S22, S33), mCoefy2); T22 = _mm256_madd_epi16(_mm256_unpacklo_epi16(S44, S55), mCoefy3); T33 = _mm256_madd_epi16(_mm256_unpacklo_epi16(S66, S77), mCoefy4); T44 = _mm256_madd_epi16(_mm256_unpackhi_epi16(S00, S11), mCoefy1); T55 = _mm256_madd_epi16(_mm256_unpackhi_epi16(S22, S33), mCoefy2); T66 = _mm256_madd_epi16(_mm256_unpackhi_epi16(S44, S55), mCoefy3); T77 = _mm256_madd_epi16(_mm256_unpackhi_epi16(S66, S77), mCoefy4); mVal1 = _mm256_add_epi32(_mm256_add_epi32(T0, T1), _mm256_add_epi32(T2, T3)); mVal2 = _mm256_add_epi32(_mm256_add_epi32(T4, T5), _mm256_add_epi32(T6, T7)); mVal3 = _mm256_add_epi32(_mm256_add_epi32(T00, T11), _mm256_add_epi32(T22, T33)); mVal4 = _mm256_add_epi32(_mm256_add_epi32(T44, T55), _mm256_add_epi32(T66, T77)); mVal1 = _mm256_srai_epi32(_mm256_add_epi32(mVal1, mAddOffset), shift); mVal2 = _mm256_srai_epi32(_mm256_add_epi32(mVal2, mAddOffset), shift); mVal3 = _mm256_srai_epi32(_mm256_add_epi32(mVal3, mAddOffset), shift); mVal4 = _mm256_srai_epi32(_mm256_add_epi32(mVal4, mAddOffset), shift); mVal = _mm256_packus_epi16(_mm256_packs_epi32(mVal1, mVal2), _mm256_packs_epi32(mVal3, mVal4)); mVal = _mm256_permute4x64_epi64(mVal, 0xd8); _mm256_maskstore_epi32((int*)(dst), mask24, mVal); tmp += i_tmp; dst += i_dst; } } } /* --------------------------------------------------------------------------- */ void intpl_chroma_block_hor_w16_avx2(pel_t *dst, int i_dst, const pel_t *src, int i_src, int width, int height, const int8_t *coeff) { int row, col; const int offset = 32; const int shift = 6; __m256i mCoef = _mm256_set1_epi32(*(int32_t*)coeff); __m256i mSwitch1 = _mm256_setr_epi8(0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6); __m256i mSwitch2 = _mm256_setr_epi8(4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10); __m256i mAddOffset = _mm256_set1_epi16((short)offset); __m256i mask16 = _mm256_setr_epi32(-1, -1, -1, -1, 0, 0, 0, 0); src -= 1; for (row = 0; row < height; row++) { for (col = 0; col < width; col += 
16) { __m256i T0, T1, sum; __m256i S = _mm256_loadu_si256((__m256i*)(src + col)); __m256i S0 = _mm256_permute4x64_epi64(S, 0x94); T0 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(S0, mSwitch1), mCoef); T1 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(S0, mSwitch2), mCoef); sum = _mm256_srai_epi16(_mm256_add_epi16(_mm256_hadd_epi16(T0, T1), mAddOffset), shift); sum = _mm256_packus_epi16(sum, sum); sum = _mm256_permute4x64_epi64(sum, 0xd8); _mm256_maskstore_epi32((int*)(dst + col), mask16, sum); } src += i_src; dst += i_dst; } } /* --------------------------------------------------------------------------- */ void intpl_chroma_block_hor_w24_avx2(pel_t *dst, int i_dst, const pel_t *src, int i_src, int height, const int8_t *coeff) { const int offset = 32; const int shift = 6; const __m256i mCoef = _mm256_set1_epi32(*(int32_t*)coeff); const __m256i mSwitch = _mm256_setr_epi8(0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10); const __m256i mSwitch1 = _mm256_setr_epi8(0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6); const __m256i mSwitch2 = _mm256_setr_epi8(4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10); const __m256i mask24 = _mm256_setr_epi16(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0); const __m256i mAddOffset = _mm256_set1_epi16((short)offset); const __m256i index = _mm256_setr_epi32(0, 1, 2, 6, 4, 5, 3, 7); int row; src -= 1; for (row = 0; row < height; row++) { __m256i T0, T1, T2, sum1, sum2; __m256i S = _mm256_loadu_si256((__m256i*)(src)); __m256i S0 = _mm256_permute4x64_epi64(S, 0x99); T0 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(S, mSwitch1), mCoef); T1 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(S, mSwitch2), mCoef); T2 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(S0, mSwitch), mCoef); sum1 = _mm256_srai_epi16(_mm256_add_epi16(_mm256_hadd_epi16(T0, T1), mAddOffset), shift); sum2 = _mm256_srai_epi16(_mm256_add_epi16(_mm256_hadd_epi16(T2, T2), mAddOffset), shift); sum1 = _mm256_permutevar8x32_epi32(_mm256_packus_epi16(sum1, sum2), index); _mm256_maskstore_epi32((int*)(dst), mask24, sum1); src += i_src; dst += i_dst; } } /* --------------------------------------------------------------------------- */ void intpl_chroma_block_ver_w32_avx2(pel_t *dst, int i_dst, const pel_t *src, int i_src, int height, const int8_t *coeff) { int row; const int offset = 32; const int shift = 6; int bsym = (coeff[1] == coeff[2]); __m256i mAddOffset = _mm256_set1_epi16((short)offset); const int i_src2 = i_src * 2; const int i_src3 = i_src * 3; src -= i_src; if (bsym) { __m256i coeff0 = _mm256_set1_epi8(coeff[0]); __m256i coeff1 = _mm256_set1_epi8(coeff[1]); for (row = 0; row < height; row++) { __m256i S0, S1, S2, S3; __m256i T0, T1, T2, T3, mVal1, mVal2; S0 = _mm256_loadu_si256((__m256i*)(src)); S1 = _mm256_loadu_si256((__m256i*)(src + i_src)); S2 = _mm256_loadu_si256((__m256i*)(src + i_src2)); S3 = _mm256_loadu_si256((__m256i*)(src + i_src3)); T0 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S0, S3), coeff0); T1 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S1, S2), coeff1); T2 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S0, S3), coeff0); T3 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S1, S2), coeff1); mVal1 = _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(T0, T1), mAddOffset), shift); mVal2 = _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(T2, T3), mAddOffset), shift); mVal1 = _mm256_packus_epi16(mVal1, mVal2); _mm256_storeu_si256((__m256i*)(dst), 
mVal1); src += i_src; dst += i_dst; } } else { __m256i coeff0 = _mm256_set1_epi16(*(int16_t*)coeff); __m256i coeff1 = _mm256_set1_epi16(*(int16_t*)(coeff + 2)); for (row = 0; row < height; row++) { __m256i T0, T1, T2, T3, mVal1, mVal2; __m256i S0 = _mm256_loadu_si256((__m256i*)(src)); __m256i S1 = _mm256_loadu_si256((__m256i*)(src + i_src)); __m256i S2 = _mm256_loadu_si256((__m256i*)(src + i_src2)); __m256i S3 = _mm256_loadu_si256((__m256i*)(src + i_src3)); T0 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S0, S1), coeff0); T1 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S2, S3), coeff1); T2 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S0, S1), coeff0); T3 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S2, S3), coeff1); mVal1 = _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(T0, T1), mAddOffset), shift); mVal2 = _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(T2, T3), mAddOffset), shift); mVal1 = _mm256_packus_epi16(mVal1, mVal2); _mm256_storeu_si256((__m256i*)(dst), mVal1); src += i_src; dst += i_dst; } } } /* --------------------------------------------------------------------------- */ void intpl_chroma_block_ver_w24_avx2(pel_t *dst, int i_dst, const pel_t *src, int i_src, int height, const int8_t *coeff) { int row; const int offset = 32; const int shift = 6; int bsym = (coeff[1] == coeff[2]); __m256i mAddOffset = _mm256_set1_epi16((short)offset); __m256i mask24 = _mm256_setr_epi16(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0); const int i_src2 = i_src * 2; const int i_src3 = i_src * 3; src -= i_src; if (bsym) { __m256i coeff0 = _mm256_set1_epi8(coeff[0]); __m256i coeff1 = _mm256_set1_epi8(coeff[1]); for (row = 0; row < height; row++) { __m256i T0, T1, T2, T3, mVal1, mVal2; __m256i S0 = _mm256_loadu_si256((__m256i*)(src)); __m256i S1 = _mm256_loadu_si256((__m256i*)(src + i_src)); __m256i S2 = _mm256_loadu_si256((__m256i*)(src + i_src2)); __m256i S3 = _mm256_loadu_si256((__m256i*)(src + i_src3)); T0 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S0, S3), coeff0); T1 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S1, S2), coeff1); T2 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S0, S3), coeff0); T3 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S1, S2), coeff1); mVal1 = _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(T0, T1), mAddOffset), shift); mVal2 = _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(T2, T3), mAddOffset), shift); mVal1 = _mm256_packus_epi16(mVal1, mVal2); _mm256_maskstore_epi32((int*)(dst), mask24, mVal1); src += i_src; dst += i_dst; } } else { __m256i coeff0 = _mm256_set1_epi16(*(int16_t*)coeff); __m256i coeff1 = _mm256_set1_epi16(*(int16_t*)(coeff + 2)); for (row = 0; row < height; row++) { __m256i T0, T1, T2, T3, mVal1, mVal2; __m256i S0 = _mm256_loadu_si256((__m256i*)(src)); __m256i S1 = _mm256_loadu_si256((__m256i*)(src + i_src)); __m256i S2 = _mm256_loadu_si256((__m256i*)(src + i_src2)); __m256i S3 = _mm256_loadu_si256((__m256i*)(src + i_src3)); T0 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S0, S1), coeff0); T1 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(S2, S3), coeff1); T2 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S0, S1), coeff0); T3 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(S2, S3), coeff1); mVal1 = _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(T0, T1), mAddOffset), shift); mVal2 = _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(T2, T3), mAddOffset), shift); mVal1 = _mm256_packus_epi16(mVal1, mVal2); _mm256_maskstore_epi32((int*)(dst), mask24, mVal1); src += i_src; dst += i_dst; } } } /* 
--------------------------------------------------------------------------- */
void intpl_chroma_block_ver_w16_avx2(pel_t *dst, int i_dst, const pel_t *src, int i_src, int height, const int8_t *coeff)
{
    int row;
    const int offset = 32;
    const int shift = 6;
    int bsym = (coeff[1] == coeff[2]);
    __m256i mAddOffset = _mm256_set1_epi16((short)offset);
    const int i_src2 = i_src * 2;
    const int i_src3 = i_src * 3;
    const int i_src4 = i_src * 4;
    src -= i_src;

    if (bsym) {
        __m256i coeff0 = _mm256_set1_epi8(coeff[0]);
        __m256i coeff1 = _mm256_set1_epi8(coeff[1]);
        for (row = 0; row < height; row = row + 2) {
            __m256i T0, T1, T2, T3, mVal1, mVal2;
            __m256i R0, R1, R2, R3;
            __m128i S0 = _mm_loadu_si128((__m128i*)(src));
            __m128i S1 = _mm_loadu_si128((__m128i*)(src + i_src));
            __m128i S2 = _mm_loadu_si128((__m128i*)(src + i_src2));
            __m128i S3 = _mm_loadu_si128((__m128i*)(src + i_src3));
            __m128i S4 = _mm_loadu_si128((__m128i*)(src + i_src4));
            R0 = _mm256_set_m128i(S0, S1);
            R1 = _mm256_set_m128i(S1, S2);
            R2 = _mm256_set_m128i(S2, S3);
            R3 = _mm256_set_m128i(S3, S4);
            T0 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(R0, R3), coeff0);
            T1 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(R0, R3), coeff0);
            T2 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(R1, R2), coeff1);
            T3 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(R1, R2), coeff1);
            mVal1 = _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(T0, T2), mAddOffset), shift);
            mVal2 = _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(T1, T3), mAddOffset), shift);
            mVal1 = _mm256_packus_epi16(mVal1, mVal2);
            _mm256_storeu2_m128i((__m128i*)dst, (__m128i*)(dst + i_dst), mVal1);
            src += 2 * i_src;
            dst += 2 * i_dst;
        }
    } else {
        __m256i coeff0 = _mm256_set1_epi16(*(int16_t*)coeff);
        __m256i coeff1 = _mm256_set1_epi16(*(int16_t*)(coeff + 2));
        for (row = 0; row < height; row = row + 2) {
            __m256i T0, T1, T2, T3, mVal1, mVal2;
            __m256i R0, R1, R2, R3;
            __m128i S0 = _mm_loadu_si128((__m128i*)(src));
            __m128i S1 = _mm_loadu_si128((__m128i*)(src + i_src));
            __m128i S2 = _mm_loadu_si128((__m128i*)(src + i_src2));
            __m128i S3 = _mm_loadu_si128((__m128i*)(src + i_src3));
            __m128i S4 = _mm_loadu_si128((__m128i*)(src + i_src4));
            R0 = _mm256_set_m128i(S0, S1);
            R1 = _mm256_set_m128i(S1, S2);
            R2 = _mm256_set_m128i(S2, S3);
            R3 = _mm256_set_m128i(S3, S4);
            T0 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(R0, R1), coeff0);
            T1 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(R0, R1), coeff0);
            T2 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(R2, R3), coeff1);
            T3 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(R2, R3), coeff1);
            mVal1 = _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(T0, T2), mAddOffset), shift);
            mVal2 = _mm256_srai_epi16(_mm256_add_epi16(_mm256_add_epi16(T1, T3), mAddOffset), shift);
            mVal1 = _mm256_packus_epi16(mVal1, mVal2);
            _mm256_storeu2_m128i((__m128i*)dst, (__m128i*)(dst + i_dst), mVal1);
            src += 2 * i_src;
            dst += 2 * i_dst;
        }
    }
}

/* --------------------------------------------------------------------------- */
void intpl_chroma_block_ext_w16_avx2(pel_t *dst, int i_dst, const pel_t *src, int i_src, int width, int height, const int8_t *coef_x, const int8_t *coef_y)
{
    ALIGN32(int16_t tmp_res[(32 + 3) * 32]);
    int16_t *tmp = tmp_res;
    const int i_tmp  = 32;
    const int i_tmp2 = 2 * i_tmp;
    const int i_tmp3 = 3 * i_tmp;
    const int shift = 12;
    int row, col;
    int bsymy = (coef_y[1] == coef_y[2]);  /* the 4-tap chroma filter is symmetric when coef[1] == coef[2] */
    __m256i mAddOffset = _mm256_set1_epi32(1 << (shift - 1));
    __m256i mCoef = _mm256_set1_epi32(*(int32_t*)coef_x);
    __m256i mSwitch1 = _mm256_setr_epi8(0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6,
                                        0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6);
    __m256i mSwitch2 = _mm256_setr_epi8(4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
                                        4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10);

    // HOR
    src -= (i_src + 1);
    for (row = -1; row < height + 2; row++) {
        for (col = 0; col < width; col += 16) {
            __m256i T0, T1, S, S0, sum;
            S  = _mm256_loadu_si256((__m256i*)(src + col));
            S0 = _mm256_permute4x64_epi64(S, 0x94);
            T0 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(S0, mSwitch1), mCoef);
            T1 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(S0, mSwitch2), mCoef);
            sum = _mm256_hadd_epi16(T0, T1);
            _mm256_storeu_si256((__m256i*)(tmp + col), sum);
        }
        src += i_src;
        tmp += i_tmp;
    }

    // VER
    tmp = tmp_res;
    if (bsymy) {
        __m256i mCoefy1 = _mm256_set1_epi16(coef_y[0]);
        __m256i mCoefy2 = _mm256_set1_epi16(coef_y[1]);
        for (row = 0; row < height; row++) {
            for (col = 0; col < width; col += 16) {
                __m256i mVal1, mVal2, mVal;
                __m256i T0, T1, T2, T3, S0, S1, S2, S3;
                S0 = _mm256_load_si256((__m256i*)(tmp + col));
                S1 = _mm256_load_si256((__m256i*)(tmp + col + i_tmp));
                S2 = _mm256_load_si256((__m256i*)(tmp + col + i_tmp2));
                S3 = _mm256_load_si256((__m256i*)(tmp + col + i_tmp3));
                T0 = _mm256_madd_epi16(_mm256_unpacklo_epi16(S0, S3), mCoefy1);
                T1 = _mm256_madd_epi16(_mm256_unpacklo_epi16(S1, S2), mCoefy2);
                T2 = _mm256_madd_epi16(_mm256_unpackhi_epi16(S0, S3), mCoefy1);
                T3 = _mm256_madd_epi16(_mm256_unpackhi_epi16(S1, S2), mCoefy2);
                mVal1 = _mm256_srai_epi32(_mm256_add_epi32(_mm256_add_epi32(T0, T1), mAddOffset), shift);
                mVal2 = _mm256_srai_epi32(_mm256_add_epi32(_mm256_add_epi32(T2, T3), mAddOffset), shift);
                mVal = _mm256_packus_epi16(_mm256_packs_epi32(mVal1, mVal2), /*no-use*/mVal1);
                mVal = _mm256_permute4x64_epi64(mVal, 0xd8);
                _mm_storeu_si128((__m128i*)(dst + col), _mm256_castsi256_si128(mVal));
            }
            tmp += i_tmp;
            dst += i_dst;
        }
    } else {
        __m256i mCoefy1 = _mm256_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)coef_y));
        __m256i mCoefy2 = _mm256_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)(coef_y + 2)));
        for (row = 0; row < height; row++) {
            for (col = 0; col < width; col += 16) {
                __m256i mVal1, mVal2, mVal;
                __m256i T0, T1, T2, T3, S0, S1, S2, S3;
                S0 = _mm256_load_si256((__m256i*)(tmp + col));
                S1 = _mm256_load_si256((__m256i*)(tmp + col + i_tmp));
                S2 = _mm256_load_si256((__m256i*)(tmp + col + i_tmp2));
                S3 = _mm256_load_si256((__m256i*)(tmp + col + i_tmp3));
                T0 = _mm256_madd_epi16(_mm256_unpacklo_epi16(S0, S1), mCoefy1);
                T1 = _mm256_madd_epi16(_mm256_unpacklo_epi16(S2, S3), mCoefy2);
                T2 = _mm256_madd_epi16(_mm256_unpackhi_epi16(S0, S1), mCoefy1);
                T3 = _mm256_madd_epi16(_mm256_unpackhi_epi16(S2, S3), mCoefy2);
                mVal1 = _mm256_srai_epi32(_mm256_add_epi32(_mm256_add_epi32(T0, T1), mAddOffset), shift);
                mVal2 = _mm256_srai_epi32(_mm256_add_epi32(_mm256_add_epi32(T2, T3), mAddOffset), shift);
                mVal = _mm256_packus_epi16(_mm256_packs_epi32(mVal1, mVal2), /*no-use*/mVal1);
                mVal = _mm256_permute4x64_epi64(mVal, 0xd8);
                _mm_storeu_si128((__m128i*)(dst + col), _mm256_castsi256_si128(mVal));
            }
            tmp += i_tmp;
            dst += i_dst;
        }
    }
}

/* --------------------------------------------------------------------------- */
void intpl_chroma_block_ext_w24_avx2(pel_t *dst, int i_dst, const pel_t *src, int i_src, int width, int height, const int8_t *coef_x, const int8_t *coef_y)
{
    ALIGN32(int16_t tmp_res[(32 + 3) * 32]);
    int16_t *tmp = tmp_res;
    const int i_tmp  = 32;
    const int i_tmp2 = 2 * i_tmp;
    const int i_tmp3 = 3 * i_tmp;
    int row;
    int bsymy = (coef_y[1] == coef_y[2]);  /* 4-tap symmetry test, as in the vertical-only chroma kernels */
    const int shift = 12;
    __m256i mAddOffset = _mm256_set1_epi32(1 << (shift - 1));
    __m256i mCoef = _mm256_set1_epi32(*(int32_t*)coef_x);
    __m256i mSwitch =
_mm256_setr_epi8(0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10); __m256i mSwitch1 = _mm256_setr_epi8(0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6); __m256i mSwitch2 = _mm256_setr_epi8(4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10); __m256i mask24 = _mm256_setr_epi16(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0); //HOR src = src - i_src - 1; UNUSED_PARAMETER(width); for (row = -1; row < height + 2; row++) { __m256i T0, T1, T2, S, S0; S = _mm256_loadu_si256((__m256i*)(src)); S0 = _mm256_permute4x64_epi64(S, 0x99); T0 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(S, mSwitch1), mCoef); T1 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(S, mSwitch2), mCoef); T2 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(S0, mSwitch), mCoef); T0 = _mm256_hadd_epi16(T0, T1); T2 = _mm256_hadd_epi16(T2, T2); T2 = _mm256_permute4x64_epi64(T2, 0xd8); T2 = _mm256_permute2x128_si256(T0, T2, 0x13); _mm_storeu_si128((__m128i*)(tmp), _mm256_castsi256_si128(T0)); _mm256_storeu_si256((__m256i*)(tmp + 8), T2); src += i_src; tmp += i_tmp; } // VER tmp = tmp_res; if (bsymy) { __m256i mCoefy1 = _mm256_set1_epi16(coef_y[0]); __m256i mCoefy2 = _mm256_set1_epi16(coef_y[1]); for (row = 0; row < height; row++) { __m256i mVal1, mVal2, mVal3, mVal4, mVal; __m256i S0, S1, S2, S3, S4, S5, S6, S7; __m256i T0, T1, T2, T3, T4, T5, T6, T7; S0 = _mm256_load_si256((__m256i*)(tmp)); S1 = _mm256_load_si256((__m256i*)(tmp + i_tmp)); S2 = _mm256_load_si256((__m256i*)(tmp + i_tmp2)); S3 = _mm256_load_si256((__m256i*)(tmp + i_tmp3)); S4 = _mm256_load_si256((__m256i*)(tmp + 16)); S5 = _mm256_load_si256((__m256i*)(tmp + 16 + i_tmp)); S6 = _mm256_load_si256((__m256i*)(tmp + 16 + i_tmp2)); S7 = _mm256_load_si256((__m256i*)(tmp + 16 + i_tmp3)); T0 = _mm256_madd_epi16(_mm256_unpacklo_epi16(S0, S3), mCoefy1); T1 = _mm256_madd_epi16(_mm256_unpacklo_epi16(S1, S2), mCoefy2); T2 = _mm256_madd_epi16(_mm256_unpackhi_epi16(S0, S3), mCoefy1); T3 = _mm256_madd_epi16(_mm256_unpackhi_epi16(S1, S2), mCoefy2); T4 = _mm256_madd_epi16(_mm256_unpacklo_epi16(S4, S7), mCoefy1); T5 = _mm256_madd_epi16(_mm256_unpacklo_epi16(S5, S6), mCoefy2); T6 = _mm256_madd_epi16(_mm256_unpackhi_epi16(S4, S7), mCoefy1); T7 = _mm256_madd_epi16(_mm256_unpackhi_epi16(S5, S6), mCoefy2); mVal1 = _mm256_srai_epi32(_mm256_add_epi32(_mm256_add_epi32(T0, T1), mAddOffset), shift); mVal2 = _mm256_srai_epi32(_mm256_add_epi32(_mm256_add_epi32(T2, T3), mAddOffset), shift); mVal3 = _mm256_srai_epi32(_mm256_add_epi32(_mm256_add_epi32(T4, T5), mAddOffset), shift); mVal4 = _mm256_srai_epi32(_mm256_add_epi32(_mm256_add_epi32(T6, T7), mAddOffset), shift); mVal = _mm256_packus_epi16(_mm256_packs_epi32(mVal1, mVal2), _mm256_packs_epi32(mVal3, mVal4)); mVal = _mm256_permute4x64_epi64(mVal, 0xd8); _mm256_maskstore_epi32((int*)(dst), mask24, mVal); tmp += i_tmp; dst += i_dst; } } else { __m256i mCoefy1 = _mm256_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)coef_y)); __m256i mCoefy2 = _mm256_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)(coef_y + 2))); for (row = 0; row < height; row++) { __m256i mVal1, mVal2, mVal3, mVal4, mVal; __m256i S0, S1, S2, S3, S4, S5, S6, S7; __m256i T0, T1, T2, T3, T4, T5, T6, T7; S0 = _mm256_load_si256((__m256i*)(tmp)); S1 = _mm256_load_si256((__m256i*)(tmp + i_tmp)); S2 = _mm256_load_si256((__m256i*)(tmp + i_tmp2)); S3 = _mm256_load_si256((__m256i*)(tmp + i_tmp3)); S4 = _mm256_load_si256((__m256i*)(tmp + 16)); S5 = 
_mm256_load_si256((__m256i*)(tmp + 16 + i_tmp));
            S6 = _mm256_load_si256((__m256i*)(tmp + 16 + i_tmp2));
            S7 = _mm256_load_si256((__m256i*)(tmp + 16 + i_tmp3));
            T0 = _mm256_madd_epi16(_mm256_unpacklo_epi16(S0, S1), mCoefy1);
            T1 = _mm256_madd_epi16(_mm256_unpacklo_epi16(S2, S3), mCoefy2);
            T2 = _mm256_madd_epi16(_mm256_unpackhi_epi16(S0, S1), mCoefy1);
            T3 = _mm256_madd_epi16(_mm256_unpackhi_epi16(S2, S3), mCoefy2);
            T4 = _mm256_madd_epi16(_mm256_unpacklo_epi16(S4, S5), mCoefy1);
            T5 = _mm256_madd_epi16(_mm256_unpacklo_epi16(S6, S7), mCoefy2);
            T6 = _mm256_madd_epi16(_mm256_unpackhi_epi16(S4, S5), mCoefy1);
            T7 = _mm256_madd_epi16(_mm256_unpackhi_epi16(S6, S7), mCoefy2);
            mVal1 = _mm256_srai_epi32(_mm256_add_epi32(_mm256_add_epi32(T0, T1), mAddOffset), shift);
            mVal2 = _mm256_srai_epi32(_mm256_add_epi32(_mm256_add_epi32(T2, T3), mAddOffset), shift);
            mVal3 = _mm256_srai_epi32(_mm256_add_epi32(_mm256_add_epi32(T4, T5), mAddOffset), shift);
            mVal4 = _mm256_srai_epi32(_mm256_add_epi32(_mm256_add_epi32(T6, T7), mAddOffset), shift);
            mVal = _mm256_packus_epi16(_mm256_packs_epi32(mVal1, mVal2), _mm256_packs_epi32(mVal3, mVal4));
            mVal = _mm256_permute4x64_epi64(mVal, 0xd8);
            _mm256_maskstore_epi32((int*)(dst), mask24, mVal);
            tmp += i_tmp;
            dst += i_dst;
        }
    }
}

/*--------------------------------------- interpolation ------------------------------------------------------*/

/* --------------------------------------------------------------------------- */
void intpl_luma_block_hor_avx2(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff)
{
    switch (width / 4 - 1) {
    case 3:
    case 7:
    case 11:
    case 15:
        intpl_luma_block_hor_w16_avx2(dst, i_dst, src, i_src, width, height, coeff);
        break;
    case 5:
        intpl_luma_block_hor_w24_avx2(dst, i_dst, src, i_src, width, height, coeff);
        break;
    default:
        intpl_luma_block_hor_sse128(dst, i_dst, src, i_src, width, height, coeff);
    }
}

/* --------------------------------------------------------------------------- */
void intpl_luma_block_ver_avx2(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff)
{
    switch (width / 4 - 1) {
    case 3:
        intpl_luma_block_ver_w16_avx2(dst, i_dst, src, i_src, width, height, coeff);
        break;
    case 5:
        intpl_luma_block_ver_w24_avx2(dst, i_dst, src, i_src, width, height, coeff);
        break;
    case 7:
        intpl_luma_block_ver_w32_avx2(dst, i_dst, src, i_src, width, height, coeff);
        break;
    case 11:
        intpl_luma_block_ver_w48_avx2(dst, i_dst, src, i_src, width, height, coeff);
        break;
    case 15:
        intpl_luma_block_ver_w64_avx2(dst, i_dst, src, i_src, width, height, coeff);
        break;
    default:
        intpl_luma_block_ver_sse128(dst, i_dst, src, i_src, width, height, coeff);
    }
}

/* --------------------------------------------------------------------------- */
void intpl_luma_block_ver0_avx2(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff)
{
    switch (width / 4 - 1) {
    case 3:
        intpl_luma_block_ver_w16_avx2(dst, i_dst, src, i_src, width, height, coeff);
        break;
    case 5:
        intpl_luma_block_ver_w24_avx2(dst, i_dst, src, i_src, width, height, coeff);
        break;
    case 7:
        intpl_luma_block_ver_w32_avx2(dst, i_dst, src, i_src, width, height, coeff);
        break;
    case 11:
        intpl_luma_block_ver_w48_avx2(dst, i_dst, src, i_src, width, height, coeff);
        break;
    case 15:
        intpl_luma_block_ver_w64_avx2(dst, i_dst, src, i_src, width, height, coeff);
        break;
    default:
        intpl_luma_block_ver_sse128(dst, i_dst, src, i_src, width, height, coeff);
    }
}

/* --------------------------------------------------------------------------- */
void
intpl_luma_block_ver1_avx2(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff) { switch (width / 4 - 1) { case 3: intpl_luma_block_ver_w16_avx2(dst, i_dst, src, i_src, width, height, coeff); break; case 5: intpl_luma_block_ver_w24_avx2(dst, i_dst, src, i_src, width, height, coeff); break; case 7: intpl_luma_block_ver_w32_avx2(dst, i_dst, src, i_src, width, height, coeff); break; case 11: intpl_luma_block_ver_w48_avx2(dst, i_dst, src, i_src, width, height, coeff); break; case 15: intpl_luma_block_ver_w64_avx2(dst, i_dst, src, i_src, width, height, coeff); break; default: intpl_luma_block_ver_sse128(dst, i_dst, src, i_src, width, height, coeff); } } /* --------------------------------------------------------------------------- */ void intpl_luma_block_ver2_avx2(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff) { switch (width / 4 - 1) { case 3: intpl_luma_block_ver_w16_avx2(dst, i_dst, src, i_src, width, height, coeff); break; case 5: intpl_luma_block_ver_w24_avx2(dst, i_dst, src, i_src, width, height, coeff); break; case 7: intpl_luma_block_ver_w32_avx2(dst, i_dst, src, i_src, width, height, coeff); break; case 11: intpl_luma_block_ver_w48_avx2(dst, i_dst, src, i_src, width, height, coeff); break; case 15: intpl_luma_block_ver_w64_avx2(dst, i_dst, src, i_src, width, height, coeff); break; default: intpl_luma_block_ver_sse128(dst, i_dst, src, i_src, width, height, coeff); } } /* --------------------------------------------------------------------------- */ void intpl_luma_block_ext_avx2(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coef_x, const int8_t *coef_y) { switch (width / 4 - 1) { case 3: case 7: case 11: case 15: intpl_luma_block_ext_w16_avx2(dst, i_dst, src, i_src, width, height, coef_x, coef_y); break; case 5: intpl_luma_block_ext_w24_avx2(dst, i_dst, src, i_src, height, coef_x, coef_y); break; default: intpl_luma_block_ext_sse128(dst, i_dst, src, i_src, width, height, coef_x, coef_y); } } /* --------------------------------------------------------------------------- */ void intpl_chroma_block_hor_avx2(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff) { switch (width / 2 - 1) { case 7: case 15: intpl_chroma_block_hor_w16_avx2(dst, i_dst, src, i_src, width, height, coeff); break; case 11: intpl_chroma_block_hor_w24_avx2(dst, i_dst, src, i_src, height, coeff); break; default: intpl_chroma_block_hor_sse128(dst, i_dst, src, i_src, width, height, coeff); } } /* --------------------------------------------------------------------------- */ void intpl_chroma_block_ver_avx2(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coeff) { switch (width / 2 - 1) { case 7: intpl_chroma_block_ver_w16_avx2(dst, i_dst, src, i_src, height, coeff); break; case 11: intpl_chroma_block_ver_w24_avx2(dst, i_dst, src, i_src, height, coeff); break; case 15: intpl_chroma_block_ver_w32_avx2(dst, i_dst, src, i_src, height, coeff); break; default: intpl_chroma_block_ver_sse128(dst, i_dst, src, i_src, width, height, coeff); } } /* --------------------------------------------------------------------------- */ void intpl_chroma_block_ext_avx2(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, const int8_t *coef_x, const int8_t *coef_y) { switch (width / 2 - 1) { case 7: case 15: intpl_chroma_block_ext_w16_avx2(dst, i_dst, src, i_src, width, height, coef_x, coef_y); break; case 11: 
intpl_chroma_block_ext_w24_avx2(dst, i_dst, src, i_src, width, height, coef_x, coef_y); break; default: intpl_chroma_block_ext_sse128(dst, i_dst, src, i_src, width, height, coef_x, coef_y); } } /* --------------------------------------------------------------------------- */ #define INTPL_LUMA_EXT_COMPUT(W0,W1,W2,W3,W4,W5,W6,W7,result) \ T0 = _mm256_madd_epi16(_mm256_unpacklo_epi16(W0, W1), mCoefy01); \ T1 = _mm256_madd_epi16(_mm256_unpacklo_epi16(W2, W3), mCoefy23); \ T2 = _mm256_madd_epi16(_mm256_unpacklo_epi16(W4, W5), mCoefy45); \ T3 = _mm256_madd_epi16(_mm256_unpacklo_epi16(W6, W7), mCoefy67); \ T4 = _mm256_madd_epi16(_mm256_unpackhi_epi16(W0, W1), mCoefy01); \ T5 = _mm256_madd_epi16(_mm256_unpackhi_epi16(W2, W3), mCoefy23); \ T6 = _mm256_madd_epi16(_mm256_unpackhi_epi16(W4, W5), mCoefy45); \ T7 = _mm256_madd_epi16(_mm256_unpackhi_epi16(W6, W7), mCoefy67); \ \ mVal1 = _mm256_add_epi32(_mm256_add_epi32(T0, T1), _mm256_add_epi32(T2, T3)); \ mVal2 = _mm256_add_epi32(_mm256_add_epi32(T4, T5), _mm256_add_epi32(T6, T7)); \ \ mVal1 = _mm256_srai_epi32(_mm256_add_epi32(mVal1, mAddOffset), shift); \ mVal2 = _mm256_srai_epi32(_mm256_add_epi32(mVal2, mAddOffset), shift); \ result = _mm256_packs_epi32(mVal1, mVal2); #define INTPL_LUMA_EXT_STORE(a, b, c) \ mVal = _mm256_permute4x64_epi64(_mm256_packus_epi16(a, b), 216); \ _mm256_storeu_si256((__m256i*)(c), mVal); /* --------------------------------------------------------------------------- */ void intpl_luma_ext_avx2(pel_t *dst, int i_dst, int16_t *tmp, int i_tmp, int width, int height, const int8_t *coeff) { const int shift = 12; int row, col; int16_t const *p; __m256i mAddOffset = _mm256_set1_epi32(1 << (shift - 1)); __m256i mCoefy01 = _mm256_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)(coeff + 0))); __m256i mCoefy23 = _mm256_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)(coeff + 2))); __m256i mCoefy45 = _mm256_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)(coeff + 4))); __m256i mCoefy67 = _mm256_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)(coeff + 6))); tmp -= 3 * i_tmp; for (row = 0; row < height; row = row + 4) { __m256i T00, T10, T20, T30, T40, T50, T60, T70, T80, T90, Ta0; __m256i T0, T1, T2, T3, T4, T5, T6, T7; __m256i U0, U1, U2, U3; __m256i V0, V1, V2, V3; __m256i mVal1, mVal2, mVal; p = tmp; for (col = 0; col < width - 31; col += 32) { T00 = _mm256_loadu_si256((__m256i*)(p)); T10 = _mm256_loadu_si256((__m256i*)(p + i_tmp)); T20 = _mm256_loadu_si256((__m256i*)(p + 2 * i_tmp)); T30 = _mm256_loadu_si256((__m256i*)(p + 3 * i_tmp)); T40 = _mm256_loadu_si256((__m256i*)(p + 4 * i_tmp)); T50 = _mm256_loadu_si256((__m256i*)(p + 5 * i_tmp)); T60 = _mm256_loadu_si256((__m256i*)(p + 6 * i_tmp)); T70 = _mm256_loadu_si256((__m256i*)(p + 7 * i_tmp)); T80 = _mm256_loadu_si256((__m256i*)(p + 8 * i_tmp)); T90 = _mm256_loadu_si256((__m256i*)(p + 9 * i_tmp)); Ta0 = _mm256_loadu_si256((__m256i*)(p + 10 * i_tmp)); INTPL_LUMA_EXT_COMPUT(T00, T10, T20, T30, T40, T50, T60, T70, U0); INTPL_LUMA_EXT_COMPUT(T10, T20, T30, T40, T50, T60, T70, T80, U1); INTPL_LUMA_EXT_COMPUT(T20, T30, T40, T50, T60, T70, T80, T90, U2); INTPL_LUMA_EXT_COMPUT(T30, T40, T50, T60, T70, T80, T90, Ta0, U3); //col + 16 T00 = _mm256_loadu_si256((__m256i*)(p + 16)); T10 = _mm256_loadu_si256((__m256i*)(p + 16 + i_tmp)); T20 = _mm256_loadu_si256((__m256i*)(p + 16 + 2 * i_tmp)); T30 = _mm256_loadu_si256((__m256i*)(p + 16 + 3 * i_tmp)); T40 = _mm256_loadu_si256((__m256i*)(p + 16 + 4 * i_tmp)); T50 = _mm256_loadu_si256((__m256i*)(p + 16 + 5 * i_tmp)); T60 = _mm256_loadu_si256((__m256i*)(p + 16 + 6 * i_tmp)); T70 = 
_mm256_loadu_si256((__m256i*)(p + 16 + 7 * i_tmp)); T80 = _mm256_loadu_si256((__m256i*)(p + 16 + 8 * i_tmp)); T90 = _mm256_loadu_si256((__m256i*)(p + 16 + 9 * i_tmp)); Ta0 = _mm256_loadu_si256((__m256i*)(p + 16 + 10 * i_tmp)); INTPL_LUMA_EXT_COMPUT(T00, T10, T20, T30, T40, T50, T60, T70, V0); INTPL_LUMA_EXT_COMPUT(T10, T20, T30, T40, T50, T60, T70, T80, V1); INTPL_LUMA_EXT_COMPUT(T20, T30, T40, T50, T60, T70, T80, T90, V2); INTPL_LUMA_EXT_COMPUT(T30, T40, T50, T60, T70, T80, T90, Ta0, V3); INTPL_LUMA_EXT_STORE(U0, V0, dst + col); INTPL_LUMA_EXT_STORE(U1, V1, dst + i_dst + col); INTPL_LUMA_EXT_STORE(U2, V2, dst + 2 * i_dst + col); INTPL_LUMA_EXT_STORE(U3, V3, dst + 3 * i_dst + col); p += 32; } if (col < width - 16) { T00 = _mm256_loadu_si256((__m256i*)(p)); T10 = _mm256_loadu_si256((__m256i*)(p + i_tmp)); T20 = _mm256_loadu_si256((__m256i*)(p + 2 * i_tmp)); T30 = _mm256_loadu_si256((__m256i*)(p + 3 * i_tmp)); T40 = _mm256_loadu_si256((__m256i*)(p + 4 * i_tmp)); T50 = _mm256_loadu_si256((__m256i*)(p + 5 * i_tmp)); T60 = _mm256_loadu_si256((__m256i*)(p + 6 * i_tmp)); T70 = _mm256_loadu_si256((__m256i*)(p + 7 * i_tmp)); T80 = _mm256_loadu_si256((__m256i*)(p + 8 * i_tmp)); T90 = _mm256_loadu_si256((__m256i*)(p + 9 * i_tmp)); Ta0 = _mm256_loadu_si256((__m256i*)(p + 10 * i_tmp)); INTPL_LUMA_EXT_COMPUT(T00, T10, T20, T30, T40, T50, T60, T70, U0); INTPL_LUMA_EXT_COMPUT(T10, T20, T30, T40, T50, T60, T70, T80, U1); INTPL_LUMA_EXT_COMPUT(T20, T30, T40, T50, T60, T70, T80, T90, U2); INTPL_LUMA_EXT_COMPUT(T30, T40, T50, T60, T70, T80, T90, Ta0, U3); //col + 16 T00 = _mm256_loadu_si256((__m256i*)(p + 16)); T10 = _mm256_loadu_si256((__m256i*)(p + 16 + i_tmp)); T20 = _mm256_loadu_si256((__m256i*)(p + 16 + 2 * i_tmp)); T30 = _mm256_loadu_si256((__m256i*)(p + 16 + 3 * i_tmp)); T40 = _mm256_loadu_si256((__m256i*)(p + 16 + 4 * i_tmp)); T50 = _mm256_loadu_si256((__m256i*)(p + 16 + 5 * i_tmp)); T60 = _mm256_loadu_si256((__m256i*)(p + 16 + 6 * i_tmp)); T70 = _mm256_loadu_si256((__m256i*)(p + 16 + 7 * i_tmp)); T80 = _mm256_loadu_si256((__m256i*)(p + 16 + 8 * i_tmp)); T90 = _mm256_loadu_si256((__m256i*)(p + 16 + 9 * i_tmp)); Ta0 = _mm256_loadu_si256((__m256i*)(p + 16 + 10 * i_tmp)); INTPL_LUMA_EXT_COMPUT(T00, T10, T20, T30, T40, T50, T60, T70, V0); INTPL_LUMA_EXT_COMPUT(T10, T20, T30, T40, T50, T60, T70, T80, V1); INTPL_LUMA_EXT_COMPUT(T20, T30, T40, T50, T60, T70, T80, T90, V2); INTPL_LUMA_EXT_COMPUT(T30, T40, T50, T60, T70, T80, T90, Ta0, V3); INTPL_LUMA_EXT_STORE(U0, V0, dst + col); INTPL_LUMA_EXT_STORE(U1, V1, dst + i_dst + col); INTPL_LUMA_EXT_STORE(U2, V2, dst + 2 * i_dst + col); INTPL_LUMA_EXT_STORE(U3, V3, dst + 3 * i_dst + col); p += 32; col += 32; } if (col < width) { T00 = _mm256_loadu_si256((__m256i*)(p)); T10 = _mm256_loadu_si256((__m256i*)(p + i_tmp)); T20 = _mm256_loadu_si256((__m256i*)(p + 2 * i_tmp)); T30 = _mm256_loadu_si256((__m256i*)(p + 3 * i_tmp)); T40 = _mm256_loadu_si256((__m256i*)(p + 4 * i_tmp)); T50 = _mm256_loadu_si256((__m256i*)(p + 5 * i_tmp)); T60 = _mm256_loadu_si256((__m256i*)(p + 6 * i_tmp)); T70 = _mm256_loadu_si256((__m256i*)(p + 7 * i_tmp)); T80 = _mm256_loadu_si256((__m256i*)(p + 8 * i_tmp)); T90 = _mm256_loadu_si256((__m256i*)(p + 9 * i_tmp)); Ta0 = _mm256_loadu_si256((__m256i*)(p + 10 * i_tmp)); INTPL_LUMA_EXT_COMPUT(T00, T10, T20, T30, T40, T50, T60, T70, U0); INTPL_LUMA_EXT_COMPUT(T10, T20, T30, T40, T50, T60, T70, T80, U1); INTPL_LUMA_EXT_COMPUT(T20, T30, T40, T50, T60, T70, T80, T90, U2); INTPL_LUMA_EXT_COMPUT(T30, T40, T50, T60, T70, T80, T90, Ta0, U3); 
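            /* Only 16 columns remain in this tail, so each INTPL_LUMA_EXT_STORE
             * below is fed the same register twice: the 32-byte store then carries
             * the 16 valid output pixels in its low half and writes a duplicate
             * copy into the padding to the right of the block (this relies on the
             * caller providing padded rows, as in the other tails of this file).
             * For reference, one output pixel of this vertical second pass is the
             * scalar expression below (a sketch; clip255() stands for the 0..255
             * clamp performed by the packs/packus pair, and p points 3 rows above
             * the current output row):
             *
             *   dst[x] = clip255((coeff[0]*p[x] + coeff[1]*p[x + i_tmp] + ...
             *                     + coeff[7]*p[x + 7*i_tmp] + (1 << 11)) >> 12);
             */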
INTPL_LUMA_EXT_STORE(U0, U0, dst + col); INTPL_LUMA_EXT_STORE(U1, U1, dst + i_dst + col); INTPL_LUMA_EXT_STORE(U2, U2, dst + 2 * i_dst + col); INTPL_LUMA_EXT_STORE(U3, U3, dst + 3 * i_dst + col); p += 16; col += 16; } tmp += i_tmp * 4; dst += i_dst * 4; } } /* --------------------------------------------------------------------------- */ void intpl_luma_ext_x3_avx2(pel_t *const dst[3], int i_dst, int16_t *tmp, int i_tmp, int width, int height, const int8_t **coeff) { #if 1 intpl_luma_ext_avx2(dst[0], i_dst, tmp, i_tmp, width, height, coeff[0]); intpl_luma_ext_avx2(dst[1], i_dst, tmp, i_tmp, width, height, coeff[1]); intpl_luma_ext_avx2(dst[2], i_dst, tmp, i_tmp, width, height, coeff[2]); #else const int shift = 12; int row, col; int16_t const *p; __m256i mAddOffset = _mm256_set1_epi32(1 << (shift - 1)); __m256i mCoefy01 = _mm256_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)(coeff[0] + 0))); __m256i mCoefy23 = _mm256_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)(coeff[0] + 2))); __m256i mCoefy45 = _mm256_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)(coeff[0] + 4))); __m256i mCoefy67 = _mm256_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)(coeff[0] + 6))); tmp -= 3 * i_tmp; for (row = 0; row < height; row = row + 4) { __m256i T00, T10, T20, T30, T40, T50, T60, T70, T80, T90, Ta0; __m256i T0, T1, T2, T3, T4, T5, T6, T7; __m256i U0, U1, U2, U3; __m256i V0, V1, V2, V3; __m256i mVal1, mVal2, mVal; p = tmp; for (col = 0; col < width - 31; col += 32) { T00 = _mm256_loadu_si256((__m256i*)(p)); T10 = _mm256_loadu_si256((__m256i*)(p + i_tmp)); T20 = _mm256_loadu_si256((__m256i*)(p + 2 * i_tmp)); T30 = _mm256_loadu_si256((__m256i*)(p + 3 * i_tmp)); T40 = _mm256_loadu_si256((__m256i*)(p + 4 * i_tmp)); T50 = _mm256_loadu_si256((__m256i*)(p + 5 * i_tmp)); T60 = _mm256_loadu_si256((__m256i*)(p + 6 * i_tmp)); T70 = _mm256_loadu_si256((__m256i*)(p + 7 * i_tmp)); T80 = _mm256_loadu_si256((__m256i*)(p + 8 * i_tmp)); T90 = _mm256_loadu_si256((__m256i*)(p + 9 * i_tmp)); Ta0 = _mm256_loadu_si256((__m256i*)(p + 10 * i_tmp)); INTPL_LUMA_EXT_COMPUT(T00, T10, T20, T30, T40, T50, T60, T70, U0); INTPL_LUMA_EXT_COMPUT(T10, T20, T30, T40, T50, T60, T70, T80, U1); INTPL_LUMA_EXT_COMPUT(T20, T30, T40, T50, T60, T70, T80, T90, U2); INTPL_LUMA_EXT_COMPUT(T30, T40, T50, T60, T70, T80, T90, Ta0, U3); //col + 16 T00 = _mm256_loadu_si256((__m256i*)(p + 16)); T10 = _mm256_loadu_si256((__m256i*)(p + 16 + i_tmp)); T20 = _mm256_loadu_si256((__m256i*)(p + 16 + 2 * i_tmp)); T30 = _mm256_loadu_si256((__m256i*)(p + 16 + 3 * i_tmp)); T40 = _mm256_loadu_si256((__m256i*)(p + 16 + 4 * i_tmp)); T50 = _mm256_loadu_si256((__m256i*)(p + 16 + 5 * i_tmp)); T60 = _mm256_loadu_si256((__m256i*)(p + 16 + 6 * i_tmp)); T70 = _mm256_loadu_si256((__m256i*)(p + 16 + 7 * i_tmp)); T80 = _mm256_loadu_si256((__m256i*)(p + 16 + 8 * i_tmp)); T90 = _mm256_loadu_si256((__m256i*)(p + 16 + 9 * i_tmp)); Ta0 = _mm256_loadu_si256((__m256i*)(p + 16 + 10 * i_tmp)); INTPL_LUMA_EXT_COMPUT(T00, T10, T20, T30, T40, T50, T60, T70, V0); INTPL_LUMA_EXT_COMPUT(T10, T20, T30, T40, T50, T60, T70, T80, V1); INTPL_LUMA_EXT_COMPUT(T20, T30, T40, T50, T60, T70, T80, T90, V2); INTPL_LUMA_EXT_COMPUT(T30, T40, T50, T60, T70, T80, T90, Ta0, V3); INTPL_LUMA_EXT_STORE(U0, V0, dst + col); INTPL_LUMA_EXT_STORE(U1, V1, dst + i_dst + col); INTPL_LUMA_EXT_STORE(U2, V2, dst + 2 * i_dst + col); INTPL_LUMA_EXT_STORE(U3, V3, dst + 3 * i_dst + col); p += 32; } if (col < width - 16) { T00 = _mm256_loadu_si256((__m256i*)(p)); T10 = _mm256_loadu_si256((__m256i*)(p + i_tmp)); T20 = _mm256_loadu_si256((__m256i*)(p + 2 * 
i_tmp)); T30 = _mm256_loadu_si256((__m256i*)(p + 3 * i_tmp)); T40 = _mm256_loadu_si256((__m256i*)(p + 4 * i_tmp)); T50 = _mm256_loadu_si256((__m256i*)(p + 5 * i_tmp)); T60 = _mm256_loadu_si256((__m256i*)(p + 6 * i_tmp)); T70 = _mm256_loadu_si256((__m256i*)(p + 7 * i_tmp)); T80 = _mm256_loadu_si256((__m256i*)(p + 8 * i_tmp)); T90 = _mm256_loadu_si256((__m256i*)(p + 9 * i_tmp)); Ta0 = _mm256_loadu_si256((__m256i*)(p + 10 * i_tmp)); INTPL_LUMA_EXT_COMPUT(T00, T10, T20, T30, T40, T50, T60, T70, U0); INTPL_LUMA_EXT_COMPUT(T10, T20, T30, T40, T50, T60, T70, T80, U1); INTPL_LUMA_EXT_COMPUT(T20, T30, T40, T50, T60, T70, T80, T90, U2); INTPL_LUMA_EXT_COMPUT(T30, T40, T50, T60, T70, T80, T90, Ta0, U3); //col + 16 T00 = _mm256_loadu_si256((__m256i*)(p + 16)); T10 = _mm256_loadu_si256((__m256i*)(p + 16 + i_tmp)); T20 = _mm256_loadu_si256((__m256i*)(p + 16 + 2 * i_tmp)); T30 = _mm256_loadu_si256((__m256i*)(p + 16 + 3 * i_tmp)); T40 = _mm256_loadu_si256((__m256i*)(p + 16 + 4 * i_tmp)); T50 = _mm256_loadu_si256((__m256i*)(p + 16 + 5 * i_tmp)); T60 = _mm256_loadu_si256((__m256i*)(p + 16 + 6 * i_tmp)); T70 = _mm256_loadu_si256((__m256i*)(p + 16 + 7 * i_tmp)); T80 = _mm256_loadu_si256((__m256i*)(p + 16 + 8 * i_tmp)); T90 = _mm256_loadu_si256((__m256i*)(p + 16 + 9 * i_tmp)); Ta0 = _mm256_loadu_si256((__m256i*)(p + 16 + 10 * i_tmp)); INTPL_LUMA_EXT_COMPUT(T00, T10, T20, T30, T40, T50, T60, T70, V0); INTPL_LUMA_EXT_COMPUT(T10, T20, T30, T40, T50, T60, T70, T80, V1); INTPL_LUMA_EXT_COMPUT(T20, T30, T40, T50, T60, T70, T80, T90, V2); INTPL_LUMA_EXT_COMPUT(T30, T40, T50, T60, T70, T80, T90, Ta0, V3); INTPL_LUMA_EXT_STORE(U0, V0, dst + col); INTPL_LUMA_EXT_STORE(U1, V1, dst + i_dst + col); INTPL_LUMA_EXT_STORE(U2, V2, dst + 2 * i_dst + col); INTPL_LUMA_EXT_STORE(U3, V3, dst + 3 * i_dst + col); p += 32; col += 32; } if (col < width) { T00 = _mm256_loadu_si256((__m256i*)(p)); T10 = _mm256_loadu_si256((__m256i*)(p + i_tmp)); T20 = _mm256_loadu_si256((__m256i*)(p + 2 * i_tmp)); T30 = _mm256_loadu_si256((__m256i*)(p + 3 * i_tmp)); T40 = _mm256_loadu_si256((__m256i*)(p + 4 * i_tmp)); T50 = _mm256_loadu_si256((__m256i*)(p + 5 * i_tmp)); T60 = _mm256_loadu_si256((__m256i*)(p + 6 * i_tmp)); T70 = _mm256_loadu_si256((__m256i*)(p + 7 * i_tmp)); T80 = _mm256_loadu_si256((__m256i*)(p + 8 * i_tmp)); T90 = _mm256_loadu_si256((__m256i*)(p + 9 * i_tmp)); Ta0 = _mm256_loadu_si256((__m256i*)(p + 10 * i_tmp)); INTPL_LUMA_EXT_COMPUT(T00, T10, T20, T30, T40, T50, T60, T70, U0); INTPL_LUMA_EXT_COMPUT(T10, T20, T30, T40, T50, T60, T70, T80, U1); INTPL_LUMA_EXT_COMPUT(T20, T30, T40, T50, T60, T70, T80, T90, U2); INTPL_LUMA_EXT_COMPUT(T30, T40, T50, T60, T70, T80, T90, Ta0, U3); INTPL_LUMA_EXT_STORE(U0, U0, dst + col); INTPL_LUMA_EXT_STORE(U1, U1, dst + i_dst + col); INTPL_LUMA_EXT_STORE(U2, U2, dst + 2 * i_dst + col); INTPL_LUMA_EXT_STORE(U3, U3, dst + 3 * i_dst + col); p += 16; col += 16; } tmp += i_tmp * 4; dst += i_dst * 4; } #endif } /* --------------------------------------------------------------------------- */ void intpl_luma_hor_avx2(pel_t *dst, int i_dst, int16_t *tmp, int i_tmp, pel_t *src, int i_src, int width, int height, int8_t const *coeff) { int row, col = 0; const short offset = 32; const int shift = 6; __m256i mAddOffset = _mm256_set1_epi16(offset); __m256i mSwitch1 = _mm256_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8); __m256i mSwitch2 = _mm256_setr_epi8(2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9, 10, 2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9, 10); __m256i 
mSwitch3 = _mm256_setr_epi8(4, 5, 6, 7, 8, 9, 10, 11, 5, 6, 7, 8, 9, 10, 11, 12, 4, 5, 6, 7, 8, 9, 10, 11, 5, 6, 7, 8, 9, 10, 11, 12); __m256i mSwitch4 = _mm256_setr_epi8(6, 7, 8, 9, 10, 11, 12, 13, 7, 8, 9, 10, 11, 12, 13, 14, 6, 7, 8, 9, 10, 11, 12, 13, 7, 8, 9, 10, 11, 12, 13, 14); #if ARCH_X86_64 __m256i mCoef = _mm256_set1_epi64x(*(long long *)coeff); #else __m256i mCoef = _mm256_loadu_si256((__m256i*)coeff); mCoef = _mm256_permute4x64_epi64(mCoef, 0x0); #endif src -= 3; for (row = 0; row < height; row++) { __m256i srcCoeff1, srcCoeff2; __m256i T20, T40, T60, T80; __m256i sum10, sum20; for (col = 0; col < width - 16; col += 32) { srcCoeff1 = _mm256_loadu_si256((__m256i*)(src + col)); srcCoeff2 = _mm256_loadu_si256((__m256i*)(src + col + 8)); T20 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(srcCoeff1, mSwitch1), mCoef); T40 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(srcCoeff1, mSwitch2), mCoef); T60 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(srcCoeff1, mSwitch3), mCoef); T80 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(srcCoeff1, mSwitch4), mCoef); sum10 = _mm256_hadd_epi16(_mm256_hadd_epi16(T20, T40), _mm256_hadd_epi16(T60, T80)); T20 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(srcCoeff2, mSwitch1), mCoef); T40 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(srcCoeff2, mSwitch2), mCoef); T60 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(srcCoeff2, mSwitch3), mCoef); T80 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(srcCoeff2, mSwitch4), mCoef); sum20 = _mm256_hadd_epi16(_mm256_hadd_epi16(T20, T40), _mm256_hadd_epi16(T60, T80)); // store 16bit _mm256_storeu_si256((__m256i*)&tmp[col], _mm256_permute2x128_si256(sum10, sum20, 32)); _mm256_storeu_si256((__m256i*)&tmp[col + 16], _mm256_permute2x128_si256(sum10, sum20, 49)); // store 8bit sum10 = _mm256_srai_epi16(_mm256_add_epi16(sum10, mAddOffset), shift); sum20 = _mm256_srai_epi16(_mm256_add_epi16(sum20, mAddOffset), shift); _mm256_storeu_si256((__m256i*)&dst[col], _mm256_packus_epi16(sum10, sum20)); } // width 16 if (col < width - 8) { srcCoeff1 = _mm256_loadu_si256((__m256i*)(src + col)); srcCoeff2 = _mm256_loadu_si256((__m256i*)(src + col + 8)); srcCoeff1 = _mm256_permute2x128_si256(srcCoeff1, srcCoeff2, 32); T20 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(srcCoeff1, mSwitch1), mCoef); T40 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(srcCoeff1, mSwitch2), mCoef); T60 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(srcCoeff1, mSwitch3), mCoef); T80 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(srcCoeff1, mSwitch4), mCoef); sum10 = _mm256_hadd_epi16(_mm256_hadd_epi16(T20, T40), _mm256_hadd_epi16(T60, T80)); // store 16bit _mm256_storeu_si256((__m256i*)&tmp[col], sum10); // store 8bit sum10 = _mm256_srai_epi16(_mm256_add_epi16(sum10, mAddOffset), shift); sum10 = _mm256_permute4x64_epi64(_mm256_packus_epi16(sum10, sum10), 8); _mm256_storeu_si256((__m256i*)&dst[col], sum10); col += 16; } // width 8 if (col < width) { srcCoeff1 = _mm256_loadu_si256((__m256i*)(src + col)); T20 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(srcCoeff1, mSwitch1), mCoef); T40 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(srcCoeff1, mSwitch2), mCoef); T60 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(srcCoeff1, mSwitch3), mCoef); T80 = _mm256_maddubs_epi16(_mm256_shuffle_epi8(srcCoeff1, mSwitch4), mCoef); sum10 = _mm256_hadd_epi16(_mm256_hadd_epi16(T20, T40), _mm256_hadd_epi16(T60, T80)); // store 16bit _mm256_storeu_si256((__m256i*)&tmp[col], sum10); // store 8bit sum10 = _mm256_srai_epi16(_mm256_add_epi16(sum10, mAddOffset), shift); sum10 = _mm256_packus_epi16(sum10, sum10); 
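        /* Horizontal first pass, 8-column tail: the unrounded 16-bit sums stay
         * in tmp[] for the vertical pass (intpl_luma_ext_avx2 above), while a
         * rounded 8-bit copy goes to dst[]. Scalar model of one pixel (a sketch;
         * note src was pre-offset by -3 at function entry):
         *
         *   tmp[x] = sum over k = 0..7 of coeff[k] * src[x + k];
         *   dst[x] = clip255((tmp[x] + 32) >> 6);
         *
         * The 32-byte store below writes past the 8 pixels actually needed and,
         * like the other tails here, relies on the row padding of dst.
         */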
_mm256_storeu_si256((__m256i*)&dst[col], sum10); } src += i_src; tmp += i_tmp; dst += i_dst; } } /* --------------------------------------------------------------------------- */ void intpl_luma_hor_x3_avx2(pel_t *const dst[3], int i_dst, mct_t *const tmp[3], int i_tmp, pel_t *src, int i_src, int width, int height, const int8_t **coeff) { int row, col = 0; const short offset = 32; const int shift = 6; __m256i mOffset = _mm256_set1_epi16(offset); __m256i mSwitch1 = _mm256_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8); __m256i mSwitch2 = _mm256_setr_epi8(2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9, 10, 2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9, 10); __m256i mSwitch3 = _mm256_setr_epi8(4, 5, 6, 7, 8, 9, 10, 11, 5, 6, 7, 8, 9, 10, 11, 12, 4, 5, 6, 7, 8, 9, 10, 11, 5, 6, 7, 8, 9, 10, 11, 12); __m256i mSwitch4 = _mm256_setr_epi8(6, 7, 8, 9, 10, 11, 12, 13, 7, 8, 9, 10, 11, 12, 13, 14, 6, 7, 8, 9, 10, 11, 12, 13, 7, 8, 9, 10, 11, 12, 13, 14); __m256i mCoef0, mCoef1, mCoef2; mct_t *tmp0 = tmp[0]; mct_t *tmp1 = tmp[1]; mct_t *tmp2 = tmp[2]; pel_t *dst0 = dst[0]; pel_t *dst1 = dst[1]; pel_t *dst2 = dst[2]; #if ARCH_X86_64 mCoef0 = _mm256_set1_epi64x(*(long long *)coeff[0]); mCoef1 = _mm256_set1_epi64x(*(long long *)coeff[1]); mCoef2 = _mm256_set1_epi64x(*(long long *)coeff[2]); #else mCoef0 = _mm256_permute4x64_epi64(_mm256_loadu_si256((__m256i*)coeff[0]), 0x0); mCoef1 = _mm256_permute4x64_epi64(_mm256_loadu_si256((__m256i*)coeff[1]), 0x0); mCoef2 = _mm256_permute4x64_epi64(_mm256_loadu_si256((__m256i*)coeff[2]), 0x0); #endif src -= 3; for (row = 0; row < height; row++) { __m256i srcCoeff1, srcCoeff2; __m256i S11, S12, S13, S14; __m256i S21, S22, S23, S24; __m256i sum10, sum20; for (col = 0; col < width - 16; col += 32) { srcCoeff1 = _mm256_loadu_si256((__m256i*)(src + col)); srcCoeff2 = _mm256_loadu_si256((__m256i*)(src + col + 8)); S11 = _mm256_shuffle_epi8(srcCoeff1, mSwitch1); S12 = _mm256_shuffle_epi8(srcCoeff1, mSwitch2); S13 = _mm256_shuffle_epi8(srcCoeff1, mSwitch3); S14 = _mm256_shuffle_epi8(srcCoeff1, mSwitch4); S21 = _mm256_shuffle_epi8(srcCoeff2, mSwitch1); S22 = _mm256_shuffle_epi8(srcCoeff2, mSwitch2); S23 = _mm256_shuffle_epi8(srcCoeff2, mSwitch3); S24 = _mm256_shuffle_epi8(srcCoeff2, mSwitch4); #define INTPL_HOR_FLT(Coef, S1, S2, S3, S4, Res) do { \ __m256i T0 = _mm256_maddubs_epi16(S1, Coef); \ __m256i T1 = _mm256_maddubs_epi16(S2, Coef); \ __m256i T2 = _mm256_maddubs_epi16(S3, Coef); \ __m256i T3 = _mm256_maddubs_epi16(S4, Coef); \ Res = _mm256_hadd_epi16(_mm256_hadd_epi16(T0, T1), _mm256_hadd_epi16(T2, T3)); \ } while (0) /* 1st */ INTPL_HOR_FLT(mCoef0, S11, S12, S13, S14, sum10); INTPL_HOR_FLT(mCoef0, S21, S22, S23, S24, sum20); // store 16bit _mm256_storeu_si256((__m256i*)&tmp0[col], _mm256_permute2x128_si256(sum10, sum20, 32)); _mm256_storeu_si256((__m256i*)&tmp0[col + 16], _mm256_permute2x128_si256(sum10, sum20, 49)); // store 8bit sum10 = _mm256_srai_epi16(_mm256_add_epi16(sum10, mOffset), shift); sum20 = _mm256_srai_epi16(_mm256_add_epi16(sum20, mOffset), shift); _mm256_storeu_si256((__m256i*)&dst0[col], _mm256_packus_epi16(sum10, sum20)); /* 2nd */ INTPL_HOR_FLT(mCoef1, S11, S12, S13, S14, sum10); INTPL_HOR_FLT(mCoef1, S21, S22, S23, S24, sum20); // store 16bit _mm256_storeu_si256((__m256i*)&tmp1[col], _mm256_permute2x128_si256(sum10, sum20, 32)); _mm256_storeu_si256((__m256i*)&tmp1[col + 16], _mm256_permute2x128_si256(sum10, sum20, 49)); // store 8bit sum10 = _mm256_srai_epi16(_mm256_add_epi16(sum10, mOffset), 
shift);
            sum20 = _mm256_srai_epi16(_mm256_add_epi16(sum20, mOffset), shift);
            _mm256_storeu_si256((__m256i*)&dst1[col], _mm256_packus_epi16(sum10, sum20));
            /* 3rd */
            INTPL_HOR_FLT(mCoef2, S11, S12, S13, S14, sum10);
            INTPL_HOR_FLT(mCoef2, S21, S22, S23, S24, sum20);
            // store 16bit
            _mm256_storeu_si256((__m256i*)&tmp2[col], _mm256_permute2x128_si256(sum10, sum20, 32));
            _mm256_storeu_si256((__m256i*)&tmp2[col + 16], _mm256_permute2x128_si256(sum10, sum20, 49));
            // store 8bit
            sum10 = _mm256_srai_epi16(_mm256_add_epi16(sum10, mOffset), shift);
            sum20 = _mm256_srai_epi16(_mm256_add_epi16(sum20, mOffset), shift);
            _mm256_storeu_si256((__m256i*)&dst2[col], _mm256_packus_epi16(sum10, sum20));
        }
        // width 16
        if (col < width - 8) {
            srcCoeff1 = _mm256_loadu_si256((__m256i*)(src + col));
            srcCoeff2 = _mm256_loadu_si256((__m256i*)(src + col + 8));
            srcCoeff1 = _mm256_permute2x128_si256(srcCoeff1, srcCoeff2, 32);
            S11 = _mm256_shuffle_epi8(srcCoeff1, mSwitch1);
            S12 = _mm256_shuffle_epi8(srcCoeff1, mSwitch2);
            S13 = _mm256_shuffle_epi8(srcCoeff1, mSwitch3);
            S14 = _mm256_shuffle_epi8(srcCoeff1, mSwitch4);
            /* 1st */
            INTPL_HOR_FLT(mCoef0, S11, S12, S13, S14, sum10);
            // store 16bit
            _mm256_storeu_si256((__m256i*)&tmp0[col], sum10);
            // store 8bit
            sum10 = _mm256_srai_epi16(_mm256_add_epi16(sum10, mOffset), shift);
            sum10 = _mm256_permute4x64_epi64(_mm256_packus_epi16(sum10, sum10), 8);
            _mm256_storeu_si256((__m256i*)&dst0[col], sum10);
            /* 2nd */
            INTPL_HOR_FLT(mCoef1, S11, S12, S13, S14, sum10);
            // store 16bit
            _mm256_storeu_si256((__m256i*)&tmp1[col], sum10);
            // store 8bit
            sum10 = _mm256_srai_epi16(_mm256_add_epi16(sum10, mOffset), shift);
            sum10 = _mm256_permute4x64_epi64(_mm256_packus_epi16(sum10, sum10), 8);
            _mm256_storeu_si256((__m256i*)&dst1[col], sum10);
            /* 3rd */
            INTPL_HOR_FLT(mCoef2, S11, S12, S13, S14, sum10);
            // store 16bit
            _mm256_storeu_si256((__m256i*)&tmp2[col], sum10);
            // store 8bit
            sum10 = _mm256_srai_epi16(_mm256_add_epi16(sum10, mOffset), shift);
            sum10 = _mm256_permute4x64_epi64(_mm256_packus_epi16(sum10, sum10), 8);
            _mm256_storeu_si256((__m256i*)&dst2[col], sum10);
            col += 16;
        }
        // width 8
        if (col < width) {
            srcCoeff1 = _mm256_loadu_si256((__m256i*)(src + col));
            S11 = _mm256_shuffle_epi8(srcCoeff1, mSwitch1);
            S12 = _mm256_shuffle_epi8(srcCoeff1, mSwitch2);
            S13 = _mm256_shuffle_epi8(srcCoeff1, mSwitch3);
            S14 = _mm256_shuffle_epi8(srcCoeff1, mSwitch4);
            /* 1st */
            INTPL_HOR_FLT(mCoef0, S11, S12, S13, S14, sum10);
            // store 16bit
            _mm256_storeu_si256((__m256i*)&tmp0[col], sum10);
            // store 8bit
            sum10 = _mm256_srai_epi16(_mm256_add_epi16(sum10, mOffset), shift);
            sum10 = _mm256_packus_epi16(sum10, sum10);
            _mm256_storeu_si256((__m256i*)&dst0[col], sum10);
            /* 2nd */
            INTPL_HOR_FLT(mCoef1, S11, S12, S13, S14, sum10);
            // store 16bit
            _mm256_storeu_si256((__m256i*)&tmp1[col], sum10);
            // store 8bit
            sum10 = _mm256_srai_epi16(_mm256_add_epi16(sum10, mOffset), shift);
            sum10 = _mm256_packus_epi16(sum10, sum10);
            _mm256_storeu_si256((__m256i*)&dst1[col], sum10);
            /* 3rd */
            INTPL_HOR_FLT(mCoef2, S11, S12, S13, S14, sum10);
            // store 16bit
            _mm256_storeu_si256((__m256i*)&tmp2[col], sum10);
            // store 8bit
            sum10 = _mm256_srai_epi16(_mm256_add_epi16(sum10, mOffset), shift);
            sum10 = _mm256_packus_epi16(sum10, sum10);
            _mm256_storeu_si256((__m256i*)&dst2[col], sum10);
        }
        src += i_src;
        tmp0 += i_tmp; tmp1 += i_tmp; tmp2 += i_tmp;
        dst0 += i_dst; dst1 += i_dst; dst2 += i_dst;
    }
#undef INTPL_HOR_FLT
}

/* --------------------------------------------------------------------------- */
#define INTPL_LUMA_VER_COMPUT(W0,W1,W2,W3,W4,W5,W6,W7,result) \
    T0 = 
_mm256_maddubs_epi16(_mm256_unpacklo_epi8(W0, W1), mCoefy01); \ T1 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(W2, W3), mCoefy23); \ T2 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(W4, W5), mCoefy45); \ T3 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(W6, W7), mCoefy67); \ T4 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(W0, W1), mCoefy01); \ T5 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(W2, W3), mCoefy23); \ T6 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(W4, W5), mCoefy45); \ T7 = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(W6, W7), mCoefy67); \ \ mVal1 = _mm256_add_epi16(_mm256_add_epi16(T0, T1), _mm256_add_epi16(T2, T3)); \ mVal2 = _mm256_add_epi16(_mm256_add_epi16(T4, T5), _mm256_add_epi16(T6, T7)); \ \ mVal1 = _mm256_srai_epi16(_mm256_add_epi16(mVal1, mAddOffset), shift); \ mVal2 = _mm256_srai_epi16(_mm256_add_epi16(mVal2, mAddOffset), shift); \ result = _mm256_packus_epi16(mVal1, mVal2); #define INTPL_LUMA_VER_STORE(a, b) \ _mm256_storeu_si256((__m256i*)(b), a); #define INTPL_LUMA_VER_COMPUT_LOW(W0,W1,W2,W3,W4,W5,W6,W7,result) \ T0 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(W0, W1), mCoefy01); \ T1 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(W2, W3), mCoefy23); \ T2 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(W4, W5), mCoefy45); \ T3 = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(W6, W7), mCoefy67); \ \ mVal1 = _mm256_add_epi16(_mm256_add_epi16(T0, T1), _mm256_add_epi16(T2, T3)); \ \ mVal1 = _mm256_srai_epi16(_mm256_add_epi16(mVal1, mAddOffset), shift); \ result = _mm256_packus_epi16(mVal1, mVal1); /* --------------------------------------------------------------------------- */ void intpl_luma_ver_avx2(pel_t *dst, int i_dst, pel_t *src, int i_src, int width, int height, int8_t const *coeff) { int row, col; const short offset = 32; const int shift = 6; __m256i mAddOffset = _mm256_set1_epi16(offset); pel_t const *p; src -= 3 * i_src; __m256i mVal1, mVal2; __m256i mCoefy01 = _mm256_set1_epi16(*(short*)coeff); __m256i mCoefy23 = _mm256_set1_epi16(*(short*)(coeff + 2)); __m256i mCoefy45 = _mm256_set1_epi16(*(short*)(coeff + 4)); __m256i mCoefy67 = _mm256_set1_epi16(*(short*)(coeff + 6)); __m256i T00, T10, T20, T30, T40, T50, T60, T70, T80, T90, Ta0; __m256i T0, T1, T2, T3, T4, T5, T6, T7; __m256i U0, U1, U2, U3; for (row = 0; row < height; row = row + 4) { p = src; for (col = 0; col < width - 8; col += 32) { T00 = _mm256_loadu_si256((__m256i*)(p)); T10 = _mm256_loadu_si256((__m256i*)(p + i_src)); T20 = _mm256_loadu_si256((__m256i*)(p + 2 * i_src)); T30 = _mm256_loadu_si256((__m256i*)(p + 3 * i_src)); T40 = _mm256_loadu_si256((__m256i*)(p + 4 * i_src)); T50 = _mm256_loadu_si256((__m256i*)(p + 5 * i_src)); T60 = _mm256_loadu_si256((__m256i*)(p + 6 * i_src)); T70 = _mm256_loadu_si256((__m256i*)(p + 7 * i_src)); T80 = _mm256_loadu_si256((__m256i*)(p + 8 * i_src)); T90 = _mm256_loadu_si256((__m256i*)(p + 9 * i_src)); Ta0 = _mm256_loadu_si256((__m256i*)(p + 10 * i_src)); INTPL_LUMA_VER_COMPUT(T00, T10, T20, T30, T40, T50, T60, T70, U0); INTPL_LUMA_VER_STORE(U0, dst + col); INTPL_LUMA_VER_COMPUT(T10, T20, T30, T40, T50, T60, T70, T80, U1); INTPL_LUMA_VER_STORE(U1, dst + i_dst + col); INTPL_LUMA_VER_COMPUT(T20, T30, T40, T50, T60, T70, T80, T90, U2); INTPL_LUMA_VER_STORE(U2, dst + 2 * i_dst + col); INTPL_LUMA_VER_COMPUT(T30, T40, T50, T60, T70, T80, T90, Ta0, U3); INTPL_LUMA_VER_STORE(U3, dst + 3 * i_dst + col); p += 32; } if (col < width) { T00 = _mm256_loadu_si256((__m256i*)(p)); T10 = _mm256_loadu_si256((__m256i*)(p + i_src)); T20 = _mm256_loadu_si256((__m256i*)(p + 2 * i_src)); T30 = 
_mm256_loadu_si256((__m256i*)(p + 3 * i_src)); T40 = _mm256_loadu_si256((__m256i*)(p + 4 * i_src)); T50 = _mm256_loadu_si256((__m256i*)(p + 5 * i_src)); T60 = _mm256_loadu_si256((__m256i*)(p + 6 * i_src)); T70 = _mm256_loadu_si256((__m256i*)(p + 7 * i_src)); T80 = _mm256_loadu_si256((__m256i*)(p + 8 * i_src)); T90 = _mm256_loadu_si256((__m256i*)(p + 9 * i_src)); Ta0 = _mm256_loadu_si256((__m256i*)(p + 10 * i_src)); INTPL_LUMA_VER_COMPUT_LOW(T00, T10, T20, T30, T40, T50, T60, T70, U0); INTPL_LUMA_VER_STORE(U0, dst + col); INTPL_LUMA_VER_COMPUT_LOW(T10, T20, T30, T40, T50, T60, T70, T80, U1); INTPL_LUMA_VER_STORE(U1, dst + i_dst + col); INTPL_LUMA_VER_COMPUT_LOW(T20, T30, T40, T50, T60, T70, T80, T90, U2); INTPL_LUMA_VER_STORE(U2, dst + 2 * i_dst + col); INTPL_LUMA_VER_COMPUT_LOW(T30, T40, T50, T60, T70, T80, T90, Ta0, U3); INTPL_LUMA_VER_STORE(U3, dst + 3 * i_dst + col); } src += 4 * i_src; dst += 4 * i_dst; } } /* --------------------------------------------------------------------------- */ void intpl_luma_ver_x3_avx2(pel_t *const dst[3], int i_dst, pel_t *src, int i_src, int width, int height, const int8_t **coeff) { #if 1 intpl_luma_ver_avx2(dst[0], i_dst, src, i_src, width, height, coeff[0]); intpl_luma_ver_avx2(dst[1], i_dst, src, i_src, width, height, coeff[1]); intpl_luma_ver_avx2(dst[2], i_dst, src, i_src, width, height, coeff[2]); #else int row, col; const short offset = 32; const int shift = 6; __m256i mAddOffset = _mm256_set1_epi16(offset); pel_t const *p; src -= 3 * i_src; __m256i mVal1, mVal2; __m256i mCoefy01 = _mm256_set1_epi16(*(short*)coeff); __m256i mCoefy23 = _mm256_set1_epi16(*(short*)(coeff + 2)); __m256i mCoefy45 = _mm256_set1_epi16(*(short*)(coeff + 4)); __m256i mCoefy67 = _mm256_set1_epi16(*(short*)(coeff + 6)); __m256i T00, T10, T20, T30, T40, T50, T60, T70, T80, T90, Ta0; __m256i T0, T1, T2, T3, T4, T5, T6, T7; __m256i U0, U1, U2, U3; for (row = 0; row < height; row = row + 4) { p = src; for (col = 0; col < width - 8; col += 32) { T00 = _mm256_loadu_si256((__m256i*)(p)); T10 = _mm256_loadu_si256((__m256i*)(p + i_src)); T20 = _mm256_loadu_si256((__m256i*)(p + 2 * i_src)); T30 = _mm256_loadu_si256((__m256i*)(p + 3 * i_src)); T40 = _mm256_loadu_si256((__m256i*)(p + 4 * i_src)); T50 = _mm256_loadu_si256((__m256i*)(p + 5 * i_src)); T60 = _mm256_loadu_si256((__m256i*)(p + 6 * i_src)); T70 = _mm256_loadu_si256((__m256i*)(p + 7 * i_src)); T80 = _mm256_loadu_si256((__m256i*)(p + 8 * i_src)); T90 = _mm256_loadu_si256((__m256i*)(p + 9 * i_src)); Ta0 = _mm256_loadu_si256((__m256i*)(p + 10 * i_src)); INTPL_LUMA_VER_COMPUT(T00, T10, T20, T30, T40, T50, T60, T70, U0); INTPL_LUMA_VER_STORE(U0, dst + col); INTPL_LUMA_VER_COMPUT(T10, T20, T30, T40, T50, T60, T70, T80, U1); INTPL_LUMA_VER_STORE(U1, dst + i_dst + col); INTPL_LUMA_VER_COMPUT(T20, T30, T40, T50, T60, T70, T80, T90, U2); INTPL_LUMA_VER_STORE(U2, dst + 2 * i_dst + col); INTPL_LUMA_VER_COMPUT(T30, T40, T50, T60, T70, T80, T90, Ta0, U3); INTPL_LUMA_VER_STORE(U3, dst + 3 * i_dst + col); p += 32; } if (col < width) { T00 = _mm256_loadu_si256((__m256i*)(p)); T10 = _mm256_loadu_si256((__m256i*)(p + i_src)); T20 = _mm256_loadu_si256((__m256i*)(p + 2 * i_src)); T30 = _mm256_loadu_si256((__m256i*)(p + 3 * i_src)); T40 = _mm256_loadu_si256((__m256i*)(p + 4 * i_src)); T50 = _mm256_loadu_si256((__m256i*)(p + 5 * i_src)); T60 = _mm256_loadu_si256((__m256i*)(p + 6 * i_src)); T70 = _mm256_loadu_si256((__m256i*)(p + 7 * i_src)); T80 = _mm256_loadu_si256((__m256i*)(p + 8 * i_src)); T90 = _mm256_loadu_si256((__m256i*)(p + 9 * i_src)); Ta0 
= _mm256_loadu_si256((__m256i*)(p + 10 * i_src));
            INTPL_LUMA_VER_COMPUT_LOW(T00, T10, T20, T30, T40, T50, T60, T70, U0);
            INTPL_LUMA_VER_STORE(U0, dst + col);
            INTPL_LUMA_VER_COMPUT_LOW(T10, T20, T30, T40, T50, T60, T70, T80, U1);
            INTPL_LUMA_VER_STORE(U1, dst + i_dst + col);
            INTPL_LUMA_VER_COMPUT_LOW(T20, T30, T40, T50, T60, T70, T80, T90, U2);
            INTPL_LUMA_VER_STORE(U2, dst + 2 * i_dst + col);
            INTPL_LUMA_VER_COMPUT_LOW(T30, T40, T50, T60, T70, T80, T90, Ta0, U3);
            INTPL_LUMA_VER_STORE(U3, dst + 3 * i_dst + col);
        }
        src += 4 * i_src;
        dst += 4 * i_dst;
    }
#endif
}
#endif
davs2-1.6/source/common/vec/intrinsic_intra-filledge.cc000066400000000000000000000450351337322544400232410ustar00rootroot00000000000000/*
 * intrinsic_intra-filledge.cc
 *
 * Description of this file:
 *    SSE assembly functions of Intra-Filledge module of the davs2 library
 *
 * --------------------------------------------------------------------------
 *
 * davs2 - video decoder of AVS2/IEEE1857.4 video coding standard
 * Copyright (C) 2018~ VCL, NELVT, Peking University
 *
 * Authors: Falei LUO
 *          etc.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at sswang @ pku.edu.cn.
 */

#include "../common.h"
#include "intrinsic.h"
#include <mmintrin.h>
#include <emmintrin.h>
#include <tmmintrin.h>
#include <smmintrin.h>

#if !HIGH_BIT_DEPTH
/* ---------------------------------------------------------------------------
 * fill reference samples for intra prediction
 * (both the top row and the left column are read from the LCU edge-pixel
 *  buffer pLcuEP; pTL/i_TL are unused in this variant)
 */
void fill_edge_samples_0_sse128(const pel_t *pTL, int i_TL, const pel_t *pLcuEP, pel_t *EP, uint32_t i_avai, int bsx, int bsy)
{
    __m128i T0, T1;
    int i, k, j;
    int num_padding;
    UNUSED_PARAMETER(pTL);
    UNUSED_PARAMETER(i_TL);

    /* fill default value */
    k = ((bsy + bsx) << 1) + 1;
    j = (k >> 4) << 4;
    T0 = _mm_set1_epi8((uint8_t)g_dc_value);
    for (i = 0; i < j; i += 16) {
        _mm_storeu_si128((__m128i *)(&EP[-(bsy << 1)] + i), T0);
    }
    memset(&EP[-(bsy << 1)] + j, g_dc_value, k - j + 1);
    EP[2 * bsx] = (pel_t)g_dc_value;

    /* get prediction pixels ---------------------------------------
     * extra pixels | left-down pixels | left pixels | top-left | top pixels | top-right pixels | extra pixels
     * -2*bsy-4 ... -2*bsy-1 | -bsy-bsy ... -bsy-1| -bsy -3 -2 -1 | 0 | 1 2 ... bsx | bsx+1 ... bsx+bsx | 2*bsx+1 ... 2*bsx+4 */
    /* fill top & top-right pixels */
    if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP)) {
        /* fill top pixels */
        if (bsx == 4) {
            memcpy(&EP[1], &pLcuEP[1], bsx * sizeof(pel_t));
        } else if (bsx == 8) {
            T1 = _mm_loadu_si128((__m128i *)&pLcuEP[1]);
            _mm_storel_epi64((__m128i *)&EP[1], T1);
        } else {
            for (i = 0; i < bsx; i += 16) {
                T1 = _mm_loadu_si128((__m128i *)(pLcuEP + i + 1));
                _mm_store_si128((__m128i *)(&EP[1] + i), T1);
            }
        }
    }

    /* fill top-right pixels */
    if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP_RIGHT)) {
        if (bsx == 4) {
            memcpy(&EP[bsx + 1], &pLcuEP[bsx + 1], bsx * sizeof(pel_t));
        } else if (bsx == 8) {
            T1 = _mm_loadu_si128((__m128i *)&pLcuEP[bsx + 1]);
            _mm_storel_epi64((__m128i *)&EP[bsx + 1], T1);
        } else {
            for (i = 0; i < bsx; i += 16) {
                T1 = _mm_loadu_si128((__m128i *)(&pLcuEP[bsx + i + 1]));
                _mm_store_si128((__m128i *)(&EP[bsx + 1] + i), T1);
            }
        }
    } else {
        if (bsx == 4) {
            memset(&EP[bsx + 1], EP[bsx], bsx);
        } else if (bsx == 8) {
            T0 = _mm_set1_epi8(EP[bsx]);    // repeat the last pixel
            _mm_storel_epi64((__m128i *)&EP[bsx + 1], T0);
        } else {
            T0 = _mm_set1_epi8(EP[bsx]);    // repeat the last pixel
            for (i = 0; i < bsx; i += 16) {
                _mm_store_si128((__m128i *)(&EP[bsx + 1 + i]), T0);
            }
        }
    }

    /* fill extra pixels */
    num_padding = bsy * 11 / 4 - bsx + 4;
    if (num_padding > 0) {
        memset(&EP[2 * bsx + 1], EP[2 * bsx], num_padding);  // from (2*bsx) to (iX + 3) = (bsy *11/4 + bsx - 1) + 3
    }

    /* fill left & left-down pixels */
    if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT)) {
        /* fill left pixels */
        memcpy(&EP[-bsy], &pLcuEP[-bsy], bsy * sizeof(pel_t));
    }

    /* fill left-down pixels */
    if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT_DOWN)) {
        memcpy(&EP[-2 * bsy], &pLcuEP[-2 * bsy], bsy * sizeof(pel_t));
    } else {
        if (bsy == 4) {
            memset(&EP[-(bsy << 1)], EP[-bsy], bsy);
        } else if (bsy == 8) {
            T0 = _mm_set1_epi8(EP[-bsy]);
            _mm_storel_epi64((__m128i *)&EP[-(bsy << 1)], T0);
        } else {
            T0 = _mm_set1_epi8(EP[-bsy]);
            for (i = 0; i < bsy; i += 16) {
                _mm_storeu_si128((__m128i *)(&EP[-(bsy << 1)] + i), T0);
            }
        }
    }

    /* fill extra pixels */
    num_padding = bsx * 11 / 4 - bsy + 4;
    if (num_padding > 0) {
        memset(&EP[-2 * bsy - num_padding], EP[-2 * bsy], num_padding);  // from (-2*bsy) to (-iY - 3) = -(bsx *11/4 + bsy - 1) - 3
    }

    /* fill EP[0] */
    if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP_LEFT)) {
        EP[0] = pLcuEP[0];
    } else if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP)) {
        EP[0] = pLcuEP[1];
    } else if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT)) {
        EP[0] = pLcuEP[-1];
    }
}

/* ---------------------------------------------------------------------------
 * fill reference samples for intra prediction
 * (the left column is read from the reconstructed picture through
 *  pL = pTL + i_TL with stride i_TL; the top row comes from the LCU
 *  edge-pixel buffer pLcuEP)
 */
void fill_edge_samples_x_sse128(const pel_t *pTL, int i_TL, const pel_t *pLcuEP, pel_t *EP, uint32_t i_avai, int bsx, int bsy)
{
    __m128i T0, T1;
    int i, k, j;
    int num_padding;
    const pel_t *pL = pTL + i_TL;

    /* fill default value */
    k = ((bsy + bsx) << 1) + 1;
    j = (k >> 4) << 4;
    T0 = _mm_set1_epi8((uint8_t)g_dc_value);
    for (i = 0; i < j; i += 16) {
        _mm_storeu_si128((__m128i *)(&EP[-(bsy << 1)] + i), T0);
    }
    memset(&EP[-(bsy << 1)] + j, g_dc_value, k - j + 1);
    EP[2 * bsx] = (pel_t)g_dc_value;

    /* get prediction pixels ---------------------------------------
     * extra pixels | left-down pixels | left pixels | top-left | top pixels | top-right pixels | extra pixels
     * -2*bsy-4 ... -2*bsy-1 | -bsy-bsy ... -bsy-1| -bsy -3 -2 -1 | 0 | 1 2 ... bsx | bsx+1 ... bsx+bsx | 2*bsx+1 ... 2*bsx+4 */
    /* fill top & top-right pixels */
    if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP)) {
        /* fill top pixels */
        if (bsx == 4) {
            memcpy(&EP[1], &pLcuEP[1], bsx * sizeof(pel_t));
        } else if (bsx == 8) {
            T1 = _mm_loadu_si128((__m128i *)&pLcuEP[1]);
            _mm_storel_epi64((__m128i *)&EP[1], T1);
        } else {
            for (i = 0; i < bsx; i += 16) {
                T1 = _mm_loadu_si128((__m128i *)(pLcuEP + i + 1));
                _mm_store_si128((__m128i *)(&EP[1] + i), T1);
            }
        }
    }

    /* fill top-right pixels */
    if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP_RIGHT)) {
        if (bsx == 4) {
            memcpy(&EP[bsx + 1], &pLcuEP[bsx + 1], bsx * sizeof(pel_t));
        } else if (bsx == 8) {
            T1 = _mm_loadu_si128((__m128i *)&pLcuEP[bsx + 1]);
            _mm_storel_epi64((__m128i *)&EP[bsx + 1], T1);
        } else {
            for (i = 0; i < bsx; i += 16) {
                T1 = _mm_loadu_si128((__m128i *)(&pLcuEP[bsx + i + 1]));
                _mm_store_si128((__m128i *)(&EP[bsx + 1] + i), T1);
            }
        }
    } else {
        if (bsx == 4) {
            memset(&EP[bsx + 1], EP[bsx], bsx);
        } else if (bsx == 8) {
            T0 = _mm_set1_epi8(EP[bsx]);    // repeat the last pixel
            _mm_storel_epi64((__m128i *)&EP[bsx + 1], T0);
        } else {
            T0 = _mm_set1_epi8(EP[bsx]);    // repeat the last pixel
            for (i = 0; i < bsx; i += 16) {
                _mm_store_si128((__m128i *)(&EP[bsx + 1 + i]), T0);
            }
        }
    }

    /* fill extra pixels */
    num_padding = bsy * 11 / 4 - bsx + 4;
    if (num_padding > 0) {
        memset(&EP[2 * bsx + 1], EP[2 * bsx], num_padding);  // from (2*bsx) to (iX + 3) = (bsy *11/4 + bsx - 1) + 3
    }

    /* fill left & left-down pixels */
    if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT)) {
        const pel_t *p_l = pL;
        int y;
        /* fill left pixels */
        for (y = 0; y < bsy; y++) {
            EP[-1 - y] = *p_l;
            p_l += i_TL;
        }
    }

    /* fill left-down pixels */
    if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT_DOWN)) {
        const pel_t *p_l = pL + bsy * i_TL;
        int y;
        for (y = 0; y < bsy; y++) {
            EP[-bsy - 1 - y] = *p_l;
            p_l += i_TL;
        }
    } else {
        if (bsy == 4) {
            memset(&EP[-(bsy << 1)], EP[-bsy], bsy);
        } else if (bsy == 8) {
            T0 = _mm_set1_epi8(EP[-bsy]);
            _mm_storel_epi64((__m128i *)&EP[-(bsy << 1)], T0);
        } else {
            T0 = _mm_set1_epi8(EP[-bsy]);
            for (i = 0; i < bsy; i += 16) {
                _mm_storeu_si128((__m128i *)(&EP[-(bsy << 1)] + i), T0);
            }
        }
    }

    /* fill extra pixels */
    num_padding = bsx * 11 / 4 - bsy + 4;
    if (num_padding > 0) {
        memset(&EP[-2 * bsy - num_padding], EP[-2 * bsy], num_padding);  // from (-2*bsy) to (-iY - 3) = -(bsx *11/4 + bsy - 1) - 3
    }

    /* fill EP[0] */
    if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP_LEFT)) {
        EP[0] = pLcuEP[0];
    } else if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP)) {
        EP[0] = pLcuEP[1];
    } else if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT)) {
        EP[0] = pL[0];
    }
}

/* ---------------------------------------------------------------------------
 * fill reference samples for intra prediction
 * (the top row is read from the reconstructed picture through pT = pTL + 1;
 *  the left column comes from the LCU edge-pixel buffer pLcuEP)
 */
void fill_edge_samples_y_sse128(const pel_t *pTL, int i_TL, const pel_t *pLcuEP, pel_t *EP, uint32_t i_avai, int bsx, int bsy)
{
    __m128i T0, T1;
    int i, k, j;
    int num_padding;
    const pel_t *pT = pTL + 1;
    UNUSED_PARAMETER(i_TL);

    /* fill default value */
    k = ((bsy + bsx) << 1) + 1;
    j = (k >> 4) << 4;
    T0 = _mm_set1_epi8((uint8_t)g_dc_value);
    for (i = 0; i < j; i += 16) {
        _mm_storeu_si128((__m128i *)(&EP[-(bsy << 1)] + i), T0);
    }
    memset(&EP[-(bsy << 1)] + j, g_dc_value, k - j + 1);
    EP[2 * bsx] = (pel_t)g_dc_value;

    /* get prediction pixels ---------------------------------------
     * extra pixels | left-down pixels | left pixels | top-left | top pixels | top-right pixels | extra pixels
     * -2*bsy-4 ... -2*bsy-1 | -bsy-bsy ... -bsy-1| -bsy -3 -2 -1 | 0 | 1 2 ... bsx | bsx+1 ... bsx+bsx | 2*bsx+1 ... 2*bsx+4 */
    /* fill top & top-right pixels */
    if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP)) {
        /* fill top pixels */
        if (bsx == 4) {
            memcpy(&EP[1], pT, bsx * sizeof(pel_t));
        } else if (bsx == 8) {
            T1 = _mm_loadu_si128((__m128i *)pT);
            _mm_storel_epi64((__m128i *)&EP[1], T1);
        } else {
            for (i = 0; i < bsx; i += 16) {
                T1 = _mm_loadu_si128((__m128i *)(pT + i));
                _mm_store_si128((__m128i *)(&EP[1] + i), T1);
            }
        }
    }

    /* fill top-right pixels */
    if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP_RIGHT)) {
        if (bsx == 4) {
            memcpy(&EP[bsx + 1], &pT[bsx], bsx * sizeof(pel_t));
        } else if (bsx == 8) {
            T1 = _mm_loadu_si128((__m128i *)&pT[bsx]);
            _mm_storel_epi64((__m128i *)&EP[bsx + 1], T1);
        } else {
            for (i = 0; i < bsx; i += 16) {
                T1 = _mm_loadu_si128((__m128i *)(&pT[bsx + i]));
                _mm_store_si128((__m128i *)(&EP[bsx + 1] + i), T1);
            }
        }
    } else {
        if (bsx == 4) {
            memset(&EP[bsx + 1], EP[bsx], bsx);
        } else if (bsx == 8) {
            T0 = _mm_set1_epi8(EP[bsx]);    // repeat the last pixel
            _mm_storel_epi64((__m128i *)&EP[bsx + 1], T0);
        } else {
            T0 = _mm_set1_epi8(EP[bsx]);    // repeat the last pixel
            for (i = 0; i < bsx; i += 16) {
                _mm_store_si128((__m128i *)(&EP[bsx + 1 + i]), T0);
            }
        }
    }

    /* fill extra pixels */
    num_padding = bsy * 11 / 4 - bsx + 4;
    if (num_padding > 0) {
        memset(&EP[2 * bsx + 1], EP[2 * bsx], num_padding);  // from (2*bsx) to (iX + 3) = (bsy *11/4 + bsx - 1) + 3
    }

    /* fill left & left-down pixels */
    if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT)) {
        /* fill left pixels */
        memcpy(&EP[-bsy], &pLcuEP[-bsy], bsy * sizeof(pel_t));
    }

    /* fill left-down pixels */
    if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT_DOWN)) {
        memcpy(&EP[-2 * bsy], &pLcuEP[-2 * bsy], bsy * sizeof(pel_t));
    } else {
        if (bsy == 4) {
            memset(&EP[-(bsy << 1)], EP[-bsy], bsy);
        } else if (bsy == 8) {
            T0 = _mm_set1_epi8(EP[-bsy]);
            _mm_storel_epi64((__m128i *)&EP[-(bsy << 1)], T0);
        } else {
            T0 = _mm_set1_epi8(EP[-bsy]);
            for (i = 0; i < bsy; i += 16) {
                _mm_storeu_si128((__m128i *)(&EP[-(bsy << 1)] + i), T0);
            }
        }
    }

    /* fill extra pixels */
    num_padding = bsx * 11 / 4 - bsy + 4;
    if (num_padding > 0) {
        memset(&EP[-2 * bsy - num_padding], EP[-2 * bsy], num_padding);  // from (-2*bsy) to (-iY - 3) = -(bsx *11/4 + bsy - 1) - 3
    }

    /* fill EP[0] */
    if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP_LEFT)) {
        EP[0] = pLcuEP[0];
    } else if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP)) {
        EP[0] = pT[0];
    } else if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT)) {
        EP[0] = pLcuEP[-1];
    }
}

/* ---------------------------------------------------------------------------
 * fill reference samples for intra prediction
 * (both the top row and the left column are read from the reconstructed
 *  picture around pTL; pLcuEP is unused in this variant)
 */
void fill_edge_samples_xy_sse128(const pel_t *pTL, int i_TL, const pel_t *pLcuEP, pel_t *EP, uint32_t i_avai, int bsx, int bsy)
{
    __m128i T0, T1;
    int i, k, j;
    int num_padding;
    const pel_t *pT = pTL + 1;
    const pel_t *pL = pTL + i_TL;
    UNUSED_PARAMETER(pLcuEP);

    /* fill default value */
    k = ((bsy + bsx) << 1) + 1;
    j = (k >> 4) << 4;
    T0 = _mm_set1_epi8((uint8_t)g_dc_value);
    for (i = 0; i < j; i += 16) {
        _mm_storeu_si128((__m128i *)(&EP[-(bsy << 1)] + i), T0);
    }
    memset(&EP[-(bsy << 1)] + j, g_dc_value, k - j + 1);
    EP[2 * bsx] = (pel_t)g_dc_value;

    /* get prediction pixels ---------------------------------------
     * extra pixels | left-down pixels | left pixels | top-left | top pixels | top-right pixels | extra pixels
     * -2*bsy-4 ... -2*bsy-1 | -bsy-bsy ... -bsy-1| -bsy -3 -2 -1 | 0 | 1 2 ... bsx | bsx+1 ... bsx+bsx | 2*bsx+1 ... 
2*bsx+4 */ /* fill top & top-right pixels */ if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP)) { /* fill top pixels */ if (bsx == 4) { memcpy(&EP[1], pT, bsx * sizeof(pel_t)); } else if (bsx == 8) { T1 = _mm_loadu_si128((__m128i *)pT); _mm_storel_epi64((__m128i *)&EP[1], T1); } else { for (i = 0; i < bsx; i += 16) { T1 = _mm_loadu_si128((__m128i *)(pT + i)); _mm_store_si128((__m128i *)(&EP[1] + i), T1); } } } /* fill top-right pixels */ if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP_RIGHT)) { if (bsx == 4) { memcpy(&EP[bsx + 1], &pT[bsx], bsx * sizeof(pel_t)); } else if (bsx == 8) { T1 = _mm_loadu_si128((__m128i *)&pT[bsx]); _mm_storel_epi64((__m128i *)&EP[bsx + 1], T1); } else { for (i = 0; i < bsx; i += 16) { T1 = _mm_loadu_si128((__m128i *)(&pT[bsx + i])); _mm_store_si128((__m128i *)(&EP[bsx + 1] + i), T1); } } } else { if (bsx == 4) { memset(&EP[bsx + 1], EP[bsx], bsx); } else if (bsx == 8) { T0 = _mm_set1_epi8(EP[bsx]); // repeat the last pixel _mm_storel_epi64((__m128i *)&EP[bsx + 1], T0); } else { T0 = _mm_set1_epi8(EP[bsx]); // repeat the last pixel for (i = 0; i < bsx; i += 16) { _mm_store_si128((__m128i *)(&EP[bsx + 1 + i]), T0); } } } /* fill extra pixels */ num_padding = bsy * 11 / 4 - bsx + 4; if (num_padding > 0) { memset(&EP[2 * bsx + 1], EP[2 * bsx], num_padding); // from (2*bsx) to (iX + 3) = (bsy *11/4 + bsx - 1) + 3 } /* fill left & left-down pixels */ if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT)) { const pel_t *p_l = pL; int y; /* fill left pixels */ for (y = 0; y < bsy; y++) { EP[-1 - y] = *p_l; p_l += i_TL; } } /* fill left-down pixels */ if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT_DOWN)) { const pel_t *p_l = pL + bsy * i_TL; int y; for (y = 0; y < bsy; y++) { EP[-bsy - 1 - y] = *p_l; p_l += i_TL; } } else { if (bsy == 4) { memset(&EP[-(bsy << 1)], EP[-bsy], bsy); } else if (bsy == 8) { T0 = _mm_set1_epi8(EP[-bsy]); _mm_storel_epi64((__m128i *)&EP[-(bsy << 1)], T0); } else { T0 = _mm_set1_epi8(EP[-bsy]); for (i = 0; i < bsy; i += 16) { _mm_storeu_si128((__m128i *)(&EP[-(bsy << 1)] + i), T0); } } } /* fill extra pixels */ num_padding = bsx * 11 / 4 - bsy + 4; if (num_padding > 0) { memset(&EP[-2 * bsy - num_padding], EP[-2 * bsy], num_padding); // from (-2*bsy) to (-iY - 3) = -(bsx *11/4 + bsy - 1) - 3 } /* fill EP[0] */ if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP_LEFT)) { EP[0] = pTL[0]; } else if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_TOP)) { EP[0] = pT[0]; } else if (IS_NEIGHBOR_AVAIL(i_avai, MD_I_LEFT)) { EP[0] = pL[0]; } } #endif // #if !HIGH_BIT_DEPTH davs2-1.6/source/common/vec/intrinsic_intra-pred.cc000066400000000000000000011551111337322544400224160ustar00rootroot00000000000000/* * intrinsic_intra-pred.cc * * Description of this file: * SSE assembly functions of Intra-Prediction module of the davs2 library * * -------------------------------------------------------------------------- * * davs2 - video decoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at sswang @ pku.edu.cn.
 */

#include "../common.h"
#include "intrinsic.h"
#include <mmintrin.h>
#include <emmintrin.h>
#include <tmmintrin.h>
#include <smmintrin.h>

#if !HIGH_BIT_DEPTH
static ALIGN16(int8_t tab_coeff_mode_5[8][16]) = {
    { 20, 52, 44, 12, 20, 52, 44, 12, 20, 52, 44, 12, 20, 52, 44, 12 },
    {  8, 40, 56, 24,  8, 40, 56, 24,  8, 40, 56, 24,  8, 40, 56, 24 },
    { 28, 60, 36,  4, 28, 60, 36,  4, 28, 60, 36,  4, 28, 60, 36,  4 },
    { 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16, 16, 48, 48, 16 },
    {  4, 36, 60, 28,  4, 36, 60, 28,  4, 36, 60, 28,  4, 36, 60, 28 },
    { 24, 56, 40,  8, 24, 56, 40,  8, 24, 56, 40,  8, 24, 56, 40,  8 },
    { 12, 44, 52, 20, 12, 44, 52, 20, 12, 44, 52, 20, 12, 44, 52, 20 },
    { 32, 64, 32,  0, 32, 64, 32,  0, 32, 64, 32,  0, 32, 64, 32,  0 }
};

static uint8_t tab_idx_mode_5[64] = {
     1,  2,  4,  5,  6,  8,  9, 11, 12, 13, 15, 16, 17, 19, 20, 22,
    23, 24, 26, 27, 28, 30, 31, 33, 34, 35, 37, 38, 39, 41, 42, 44,
    45, 46, 48, 49, 50, 52, 53, 55, 56, 57, 59, 60, 61, 63, 64, 66,
    67, 68, 70, 71, 72, 74, 75, 77, 78, 79, 81, 82, 83, 85, 86, 88
};

/* --------------------------------------------------------------------------- */
void intra_pred_ver_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
{
    int y;
    pel_t *rpSrc = src + 1;
    __m128i T1, T2, T3, T4;
    UNUSED_PARAMETER(dir_mode);

    switch (bsx) {
    case 4:
        for (y = 0; y < bsy; y += 2) {
            CP32(dst, rpSrc);
            CP32(dst + i_dst, rpSrc);
            dst += i_dst << 1;
        }
        break;
    case 8:
        for (y = 0; y < bsy; y += 2) {
            CP64(dst, rpSrc);
            CP64(dst + i_dst, rpSrc);
            dst += i_dst << 1;
        }
        break;
    case 16:
        T1 = _mm_loadu_si128((__m128i*)rpSrc);
        for (y = 0; y < bsy; y++) {
            _mm_storeu_si128((__m128i*)(dst), T1);
            dst += i_dst;
        }
        break;
    case 32:
        T1 = _mm_loadu_si128((__m128i*)(rpSrc +  0));
        T2 = _mm_loadu_si128((__m128i*)(rpSrc + 16));
        for (y = 0; y < bsy; y++) {
            _mm_storeu_si128((__m128i*)(dst +  0), T1);
            _mm_storeu_si128((__m128i*)(dst + 16), T2);
            dst += i_dst;
        }
        break;
    case 64:
        T1 = _mm_loadu_si128((__m128i*)(rpSrc +  0));
        T2 = _mm_loadu_si128((__m128i*)(rpSrc + 16));
        T3 = _mm_loadu_si128((__m128i*)(rpSrc + 32));
        T4 = _mm_loadu_si128((__m128i*)(rpSrc + 48));
        for (y = 0; y < bsy; y++) {
            _mm_storeu_si128((__m128i*)(dst +  0), T1);
            _mm_storeu_si128((__m128i*)(dst + 16), T2);
            _mm_storeu_si128((__m128i*)(dst + 32), T3);
            _mm_storeu_si128((__m128i*)(dst + 48), T4);
            dst += i_dst;
        }
        break;
    default:
        assert(0);
        break;
    }
}

/* --------------------------------------------------------------------------- */
void intra_pred_hor_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
{
    int y;
    pel_t *rpSrc = src - 1;
    __m128i T;
    UNUSED_PARAMETER(dir_mode);

    switch (bsx) {
    case 4:
        for (y = 0; y < bsy; y++) {
            M32(dst) = 0x01010101 * rpSrc[-y];
            dst += i_dst;
        }
        break;
    case 8:
        for (y = 0; y < bsy; y++) {
            M64(dst) = 0x0101010101010101 * rpSrc[-y];
            dst += i_dst;
        }
        break;
    case 16:
        for (y = 0; y < bsy; y++) {
            T = _mm_set1_epi8((char)rpSrc[-y]);
            _mm_storeu_si128((__m128i*)(dst), T);
            dst += i_dst;
        }
        break;
    case 32:
        for (y = 0; y < bsy; y++) {
            T = _mm_set1_epi8((char)rpSrc[-y]);
            _mm_storeu_si128((__m128i*)(dst +  0), T);
            _mm_storeu_si128((__m128i*)(dst + 16), T);
            dst += i_dst;
        }
        break;
    case 64:
        for (y = 0; y < bsy; y++) {
            T = _mm_set1_epi8((char)rpSrc[-y]);
            _mm_storeu_si128((__m128i*)(dst +  0), T);
            _mm_storeu_si128((__m128i*)(dst 
+ 16), T); _mm_storeu_si128((__m128i*)(dst + 32), T); _mm_storeu_si128((__m128i*)(dst + 48), T); dst += i_dst; } break; default: assert(0); break; } } /* --------------------------------------------------------------------------- */ void intra_pred_dc_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int avail_above = dir_mode >> 8; int avail_left = dir_mode & 0xFF; int dc_value; int sum_above = 0; int sum_left = 0; int x, y; pel_t *p_src; __m128i zero = _mm_setzero_si128(); __m128i S0; __m128i p00, p10, p20, p30; /* sum of left samples */ // for (y = 0; y < bsy; y++) dc_value += p_src[-y]; p_src = src - bsy; if (bsy == 4) { sum_left += p_src[0] + p_src[1] + p_src[2] + p_src[3]; } else if (bsy == 8) { S0 = _mm_loadu_si128((__m128i*)(p_src)); p00 = _mm_unpacklo_epi8(S0, zero); p10 = _mm_srli_si128(p00, 8); p00 = _mm_add_epi16(p00, p10); sum_left += M128_U16(p00, 0) + M128_U16(p00, 1) + M128_U16(p00, 2) + M128_U16(p00, 3); } else { p30 = zero; for (y = 0; y < bsy - 8; y += 16, p_src += 16) { S0 = _mm_loadu_si128((__m128i*)(p_src)); p00 = _mm_unpacklo_epi8(S0, zero); p10 = _mm_unpackhi_epi8(S0, zero); p20 = _mm_add_epi16(p00, p10); p30 = _mm_add_epi16(p30, p20); } p00 = _mm_srli_si128(p30, 8); p00 = _mm_add_epi16(p30, p00); sum_left += M128_U16(p00, 0) + M128_U16(p00, 1) + M128_U16(p00, 2) + M128_U16(p00, 3); } /* sum of above samples */ //for (x = 0; x < bsx; x++) dc_value += p_src[x]; p_src = src + 1; if (bsx == 4) { sum_above += p_src[0] + p_src[1] + p_src[2] + p_src[3]; } else if (bsx == 8) { S0 = _mm_loadu_si128((__m128i*)(p_src)); p00 = _mm_unpacklo_epi8(S0, zero); p10 = _mm_srli_si128(p00, 8); p00 = _mm_add_epi16(p00, p10); sum_above += M128_U16(p00, 0) + M128_U16(p00, 1) + M128_U16(p00, 2) + M128_U16(p00, 3); } else { p30 = zero; for (x = 0; x < bsx - 8; x += 16, p_src += 16) { S0 = _mm_loadu_si128((__m128i*)(p_src)); p00 = _mm_unpacklo_epi8(S0, zero); p10 = _mm_unpackhi_epi8(S0, zero); p20 = _mm_add_epi16(p00, p10); p30 = _mm_add_epi16(p30, p20); } p00 = _mm_srli_si128(p30, 8); p00 = _mm_add_epi16(p30, p00); sum_above += M128_U16(p00, 0) + M128_U16(p00, 1) + M128_U16(p00, 2) + M128_U16(p00, 3); } if (avail_left && avail_above) { x = bsx + bsy; dc_value = ((sum_above + sum_left + (x >> 1)) * (512 / x)) >> 9; } else if (avail_left) { dc_value = (sum_left + (bsy >> 1)) >> davs2_log2u(bsy); } else if (avail_above) { dc_value = (sum_above + (bsx >> 1)) >> davs2_log2u(bsx); } else { dc_value = g_dc_value; } p00 = _mm_set1_epi8((pel_t)dc_value); for (y = 0; y < bsy; y++) { if (bsx == 8) { _mm_storel_epi64((__m128i*)dst, p00); } else if (bsx == 4) { *(int*)(dst) = _mm_cvtsi128_si32(p00); } else { for (x = 0; x < bsx - 8; x += 16) { _mm_storeu_si128((__m128i*)(dst + x), p00); } } dst += i_dst; } } /* --------------------------------------------------------------------------- */ void intra_pred_plane_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { pel_t *rpSrc; int iH = 0; int iV = 0; int iA, iB, iC; int x, y; int iW2 = bsx >> 1; int iH2 = bsy >> 1; int ib_mult[5] = { 13, 17, 5, 11, 23 }; int ib_shift[5] = { 7, 10, 11, 15, 19 }; int im_h = ib_mult[tab_log2[bsx] - 2]; int is_h = ib_shift[tab_log2[bsx] - 2]; int im_v = ib_mult[tab_log2[bsy] - 2]; int is_v = ib_shift[tab_log2[bsy] - 2]; int iTmp; UNUSED_PARAMETER(dir_mode); rpSrc = src + iW2; for (x = 1; x < iW2 + 1; x++) { iH += x * (rpSrc[x] - rpSrc[-x]); } rpSrc = src - iH2; for (y = 1; y < iH2 + 1; y++) { iV += y * (rpSrc[-y] - rpSrc[y]); } iA = (src[-1 - (bsy - 1)] + src[1 + bsx - 1]) << 
4;
    iB = ((iH << 5) * im_h + (1 << (is_h - 1))) >> is_h;
    iC = ((iV << 5) * im_v + (1 << (is_v - 1))) >> is_v;

    iTmp = iA - (iH2 - 1) * iC - (iW2 - 1) * iB + 16;

    __m128i TC, TB, TA, T_Start, T, D, D1;
    TA = _mm_set1_epi16((int16_t)iTmp);
    TB = _mm_set1_epi16((int16_t)iB);
    TC = _mm_set1_epi16((int16_t)iC);

    T_Start = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
    T_Start = _mm_mullo_epi16(TB, T_Start);
    T_Start = _mm_add_epi16(T_Start, TA);
    TB = _mm_mullo_epi16(TB, _mm_set1_epi16(8));

    if (bsx == 4) {
        for (y = 0; y < bsy; y++) {
            D = _mm_srai_epi16(T_Start, 5);
            D = _mm_packus_epi16(D, D);
            // extract the low 32 bits of the packed result and store them as one integer (redundant operation?)
            _mm_stream_si32((int *)dst, _mm_extract_epi32(D, 0));
            T_Start = _mm_add_epi16(T_Start, TC);
            dst += i_dst;
        }
    } else if (bsx == 8) {
        for (y = 0; y < bsy; y++) {
            D = _mm_srai_epi16(T_Start, 5);
            D = _mm_packus_epi16(D, D);
            _mm_storel_epi64((__m128i*)dst, D);
            T_Start = _mm_add_epi16(T_Start, TC);
            dst += i_dst;
        }
    } else {
        for (y = 0; y < bsy; y++) {
            T = T_Start;
            for (x = 0; x < bsx; x += 16) {
                D = _mm_srai_epi16(T, 5);
                T = _mm_add_epi16(T, TB);
                D1 = _mm_srai_epi16(T, 5);
                T = _mm_add_epi16(T, TB);
                D = _mm_packus_epi16(D, D1);
                _mm_storeu_si128((__m128i*)(dst + x), D);
            }
            T_Start = _mm_add_epi16(T_Start, TC);
            dst += i_dst;
        }
    }
}

/* --------------------------------------------------------------------------- */
void intra_pred_bilinear_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
{
    int x, y;
    int ishift_x = tab_log2[bsx];
    int ishift_y = tab_log2[bsy];
    int ishift = DAVS2_MIN(ishift_x, ishift_y);
    int ishift_xy = ishift_x + ishift_y + 1;
    int offset = 1 << (ishift_x + ishift_y);
    int a, b, c, w, val;
    pel_t *p;
    __m128i T, T1, T2, T3, C1, C2, ADD;
    __m128i ZERO = _mm_setzero_si128();

    /* TODO: why do these buffers need the extra 32 entries of padding? is it necessary? */
    ALIGN32(itr_t pTop [MAX_CU_SIZE + 32]);
    ALIGN32(itr_t pLeft[MAX_CU_SIZE + 32]);
    ALIGN32(itr_t pT   [MAX_CU_SIZE + 32]);
    ALIGN32(itr_t pL   [MAX_CU_SIZE + 32]);
    ALIGN32(itr_t wy   [MAX_CU_SIZE + 32]);
    UNUSED_PARAMETER(dir_mode);

    p = src + 1;
    for (x = 0; x < bsx; x += 16) {
        T = _mm_loadu_si128((__m128i*)(p + x));
        T1 = _mm_unpacklo_epi8(T, ZERO);
        T2 = _mm_unpackhi_epi8(T, ZERO);
        _mm_store_si128((__m128i*)(pTop + x), T1);
        _mm_store_si128((__m128i*)(pTop + x + 8), T2);
    }
    for (y = 0; y < bsy; y++) {
        pLeft[y] = src[-1 - y];
    }
    a = pTop[bsx - 1];
    b = pLeft[bsy - 1];

    if (bsx == bsy) {
        c = (a + b + 1) >> 1;
    } else {
        c = (((a << ishift_x) + (b << ishift_y)) * 13 + (1 << (ishift + 5))) >> (ishift + 6);
    }
    w = (c << 1) - a - b;

    T = _mm_set1_epi16((int16_t)b);
    for (x = 0; x < bsx; x += 8) {
        T1 = _mm_load_si128((__m128i*)(pTop + x));
        T2 = _mm_sub_epi16(T, T1);
        T1 = _mm_slli_epi16(T1, ishift_y);
        _mm_store_si128((__m128i*)(pT + x), T2);
        _mm_store_si128((__m128i*)(pTop + x), T1);
    }

    T = _mm_set1_epi16((int16_t)a);
    for (y = 0; y < bsy; y += 8) {
        T1 = _mm_load_si128((__m128i*)(pLeft + y));
        T2 = _mm_sub_epi16(T, T1);
        T1 = _mm_slli_epi16(T1, ishift_x);
        _mm_store_si128((__m128i*)(pL + y), T2);
        _mm_store_si128((__m128i*)(pLeft + y), T1);
    }

    T = _mm_set1_epi16((int16_t)w);
    T = _mm_mullo_epi16(T, _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0));
    T1 = _mm_set1_epi16((int16_t)(8 * w));
    for (y = 0; y < bsy; y += 8) {
        _mm_store_si128((__m128i*)(wy + y), T);
        T = _mm_add_epi16(T, T1);
    }

    C1 = _mm_set_epi32(3, 2, 1, 0);
    C2 = _mm_set1_epi32(4);

    if (bsx == 4) {
        __m128i pTT = _mm_loadl_epi64((__m128i*)pT);
        T = _mm_loadl_epi64((__m128i*)pTop);
        for (y = 0; y < bsy; y++) {
            int add = (pL[y] << ishift_y) + wy[y];
            ADD = _mm_set1_epi32(add);
            ADD = _mm_mullo_epi32(C1, ADD);
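            /* Lane x of ADD now holds x * ((pL[y] << ishift_y) + wy[y]); after
             * val is added on the next line it becomes
             *   ((x + 1) * pL[y] + pLeft[y]) << ishift_y  +  x * wy[y]  +  offset,
             * the left/row-dependent half of the bilinear blend. T (updated once
             * per row below) accumulates pTop[x] + (y + 1) * pT[x], with pTop
             * pre-scaled by << ishift_y; it is scaled by << ishift_x, added to
             * ADD and rounded by >> ishift_xy to give the predicted pixel.
             * (A reading of the code above, not a normative formula.)
             */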
val = (pLeft[y] << ishift_y) + offset + (pL[y] << ishift_y); ADD = _mm_add_epi32(ADD, _mm_set1_epi32(val)); T = _mm_add_epi16(T, pTT); T1 = _mm_cvtepi16_epi32(T); T1 = _mm_slli_epi32(T1, ishift_x); T1 = _mm_add_epi32(T1, ADD); T1 = _mm_srai_epi32(T1, ishift_xy); T1 = _mm_packus_epi32(T1, T1); T1 = _mm_packus_epi16(T1, T1); M32(dst) = _mm_cvtsi128_si32(T1); dst += i_dst; } } else if (bsx == 8) { __m128i pTT = _mm_load_si128((__m128i*)pT); T = _mm_load_si128((__m128i*)pTop); for (y = 0; y < bsy; y++) { int add = (pL[y] << ishift_y) + wy[y]; ADD = _mm_set1_epi32(add); T3 = _mm_mullo_epi32(C2, ADD); ADD = _mm_mullo_epi32(C1, ADD); val = (pLeft[y] << ishift_y) + offset + (pL[y] << ishift_y); ADD = _mm_add_epi32(ADD, _mm_set1_epi32(val)); T = _mm_add_epi16(T, pTT); T1 = _mm_cvtepi16_epi32(T); T2 = _mm_cvtepi16_epi32(_mm_srli_si128(T, 8)); T1 = _mm_slli_epi32(T1, ishift_x); T2 = _mm_slli_epi32(T2, ishift_x); T1 = _mm_add_epi32(T1, ADD); T1 = _mm_srai_epi32(T1, ishift_xy); ADD = _mm_add_epi32(ADD, T3); T2 = _mm_add_epi32(T2, ADD); T2 = _mm_srai_epi32(T2, ishift_xy); ADD = _mm_add_epi32(ADD, T3); T1 = _mm_packus_epi32(T1, T2); T1 = _mm_packus_epi16(T1, T1); _mm_storel_epi64((__m128i*)dst, T1); dst += i_dst; } } else { __m128i TT[16]; __m128i PTT[16]; for (x = 0; x < bsx; x += 8) { int idx = x >> 2; __m128i M0 = _mm_load_si128((__m128i*)(pTop + x)); __m128i M1 = _mm_load_si128((__m128i*)(pT + x)); TT[idx] = _mm_unpacklo_epi16(M0, ZERO); TT[idx + 1] = _mm_unpackhi_epi16(M0, ZERO); PTT[idx] = _mm_cvtepi16_epi32(M1); PTT[idx + 1] = _mm_cvtepi16_epi32(_mm_srli_si128(M1, 8)); } for (y = 0; y < bsy; y++) { int add = (pL[y] << ishift_y) + wy[y]; ADD = _mm_set1_epi32(add); T3 = _mm_mullo_epi32(C2, ADD); ADD = _mm_mullo_epi32(C1, ADD); val = (pLeft[y] << ishift_y) + offset + (pL[y] << ishift_y); ADD = _mm_add_epi32(ADD, _mm_set1_epi32(val)); for (x = 0; x < bsx; x += 8) { int idx = x >> 2; TT[idx] = _mm_add_epi32(TT[idx], PTT[idx]); TT[idx + 1] = _mm_add_epi32(TT[idx + 1], PTT[idx + 1]); T1 = _mm_slli_epi32(TT[idx], ishift_x); T2 = _mm_slli_epi32(TT[idx + 1], ishift_x); T1 = _mm_add_epi32(T1, ADD); T1 = _mm_srai_epi32(T1, ishift_xy); ADD = _mm_add_epi32(ADD, T3); T2 = _mm_add_epi32(T2, ADD); T2 = _mm_srai_epi32(T2, ishift_xy); ADD = _mm_add_epi32(ADD, T3); T1 = _mm_packus_epi32(T1, T2); T1 = _mm_packus_epi16(T1, T1); _mm_storel_epi64((__m128i*)(dst + x), T1); } dst += i_dst; } } } /* --------------------------------------------------------------------------- */ void intra_pred_ang_x_3_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { __m128i zero = _mm_setzero_si128(); __m128i coeff2 = _mm_set1_epi16(2); __m128i coeff3 = _mm_set1_epi16(3); __m128i coeff4 = _mm_set1_epi16(4); __m128i coeff5 = _mm_set1_epi16(5); __m128i coeff7 = _mm_set1_epi16(7); __m128i coeff8 = _mm_set1_epi16(8); pel_t *dst1 = dst; pel_t *dst2 = dst1 + i_dst; pel_t *dst3 = dst2 + i_dst; pel_t *dst4 = dst3 + i_dst; UNUSED_PARAMETER(dir_mode); if ((bsy > 4) && (bsx > 8)) { ALIGN16(pel_t first_line[(64 + 176 + 16) << 2]); int line_size = bsx + (((bsy - 4) * 11) >> 2); #if !BUGFIX_PREDICTION_INTRA int iW2 = bsx * 2 - 1; int real_size = DAVS2_MIN(line_size, iW2 + 1); #endif int aligned_line_size = 64 + 176 + 16; int i; pel_t *pfirst[4]; #if !BUGFIX_PREDICTION_INTRA pel_t *src_org = src; #endif pfirst[0] = first_line; pfirst[1] = pfirst[0] + aligned_line_size; pfirst[2] = pfirst[1] + aligned_line_size; pfirst[3] = pfirst[2] + aligned_line_size; #if BUGFIX_PREDICTION_INTRA for (i = 0; i < line_size - 8; i += 16, src += 
16) { #else for (i = 0; i < real_size - 8; i += 16, src += 16) { #endif __m128i p00, p10, p20, p30; __m128i p01, p11, p21, p31; __m128i SS2 = _mm_loadu_si128((__m128i*)(src + 2)); __m128i L2 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L3 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L4 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L5 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L6 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L7 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L8 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L9 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L10 = _mm_unpacklo_epi8(SS2, zero); __m128i H2 = L10; __m128i SS11 = _mm_loadu_si128((__m128i*)(src + 11)); __m128i L11 = _mm_unpacklo_epi8(SS11, zero); __m128i H3 = L11; SS11 = _mm_srli_si128(SS11, 1); __m128i L12 = _mm_unpacklo_epi8(SS11, zero); __m128i H4 = L12; SS11 = _mm_srli_si128(SS11, 1); __m128i L13 = _mm_unpacklo_epi8(SS11, zero); __m128i H5 = L13; SS11 = _mm_srli_si128(SS11, 1); __m128i H6 = _mm_unpacklo_epi8(SS11, zero); SS11 = _mm_srli_si128(SS11, 1); __m128i H7 = _mm_unpacklo_epi8(SS11, zero); SS11 = _mm_srli_si128(SS11, 1); __m128i H8 = _mm_unpacklo_epi8(SS11, zero); SS11 = _mm_srli_si128(SS11, 1); __m128i H9 = _mm_unpacklo_epi8(SS11, zero); SS11 = _mm_srli_si128(SS11, 1); __m128i H10 = _mm_unpacklo_epi8(SS11, zero); SS11 = _mm_srli_si128(SS11, 1); __m128i H11 = _mm_unpacklo_epi8(SS11, zero); __m128i SS20 = _mm_loadu_si128((__m128i*)(src + 20)); __m128i H12 = _mm_unpacklo_epi8(SS20, zero); SS20 = _mm_srli_si128(SS20, 1); __m128i H13 = _mm_unpacklo_epi8(SS20, zero); p00 = _mm_add_epi16(L2, coeff8); p10 = _mm_mullo_epi16(L3, coeff5); p20 = _mm_mullo_epi16(L4, coeff7); p30 = _mm_mullo_epi16(L5, coeff3); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 4); p01 = _mm_add_epi16(H2, coeff8); p11 = _mm_mullo_epi16(H3, coeff5); p21 = _mm_mullo_epi16(H4, coeff7); p31 = _mm_mullo_epi16(H5, coeff3); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 4); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)&pfirst[0][i], p00); p00 = _mm_add_epi16(L5, L8); p10 = _mm_add_epi16(L6, L7); p10 = _mm_mullo_epi16(p10, coeff3); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_add_epi16(p00, p10); p00 = _mm_srli_epi16(p00, 3); p01 = _mm_add_epi16(H5, H8); p11 = _mm_add_epi16(H6, H7); p11 = _mm_mullo_epi16(p11, coeff3); p01 = _mm_add_epi16(p01, coeff4); p01 = _mm_add_epi16(p01, p11); p01 = _mm_srli_epi16(p01, 3); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)&pfirst[1][i], p00); p00 = _mm_mullo_epi16(L8, coeff3); p10 = _mm_mullo_epi16(L9, coeff7); p20 = _mm_mullo_epi16(L10, coeff5); p30 = _mm_add_epi16(L11, coeff8); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 4); p01 = _mm_mullo_epi16(H8, coeff3); p11 = _mm_mullo_epi16(H9, coeff7); p21 = _mm_mullo_epi16(H10, coeff5); p31 = _mm_add_epi16(H11, coeff8); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 4); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)&pfirst[2][i], p00); p00 = _mm_add_epi16(L11, L13); p10 = _mm_mullo_epi16(L12, coeff2); p00 = _mm_add_epi16(p00, coeff2); p00 = 
_mm_add_epi16(p00, p10);
            p00 = _mm_srli_epi16(p00, 2);

            p01 = _mm_add_epi16(H11, H13);
            p11 = _mm_mullo_epi16(H12, coeff2);
            p01 = _mm_add_epi16(p01, coeff2);
            p01 = _mm_add_epi16(p01, p11);
            p01 = _mm_srli_epi16(p01, 2);

            p00 = _mm_packus_epi16(p00, p01);
            _mm_store_si128((__m128i*)&pfirst[3][i], p00);
        }

#if BUGFIX_PREDICTION_INTRA
        if (i < line_size) {
#else
        if (i < real_size) {
#endif
            __m128i p00, p10, p20, p30;
            __m128i SS2 = _mm_loadu_si128((__m128i*)(src + 2));
            __m128i L2  = _mm_unpacklo_epi8(SS2, zero);  SS2 = _mm_srli_si128(SS2, 1);
            __m128i L3  = _mm_unpacklo_epi8(SS2, zero);  SS2 = _mm_srli_si128(SS2, 1);
            __m128i L4  = _mm_unpacklo_epi8(SS2, zero);  SS2 = _mm_srli_si128(SS2, 1);
            __m128i L5  = _mm_unpacklo_epi8(SS2, zero);  SS2 = _mm_srli_si128(SS2, 1);
            __m128i L6  = _mm_unpacklo_epi8(SS2, zero);  SS2 = _mm_srli_si128(SS2, 1);
            __m128i L7  = _mm_unpacklo_epi8(SS2, zero);  SS2 = _mm_srli_si128(SS2, 1);
            __m128i L8  = _mm_unpacklo_epi8(SS2, zero);  SS2 = _mm_srli_si128(SS2, 1);
            __m128i L9  = _mm_unpacklo_epi8(SS2, zero);  SS2 = _mm_srli_si128(SS2, 1);
            __m128i L10 = _mm_unpacklo_epi8(SS2, zero);
            __m128i SS11 = _mm_loadu_si128((__m128i*)(src + 11));
            __m128i L11 = _mm_unpacklo_epi8(SS11, zero);  SS11 = _mm_srli_si128(SS11, 1);
            __m128i L12 = _mm_unpacklo_epi8(SS11, zero);  SS11 = _mm_srli_si128(SS11, 1);
            __m128i L13 = _mm_unpacklo_epi8(SS11, zero);

            p00 = _mm_add_epi16(L2, coeff8);
            p10 = _mm_mullo_epi16(L3, coeff5);
            p20 = _mm_mullo_epi16(L4, coeff7);
            p30 = _mm_mullo_epi16(L5, coeff3);
            p00 = _mm_add_epi16(p00, p10);
            p00 = _mm_add_epi16(p00, p20);
            p00 = _mm_add_epi16(p00, p30);
            p00 = _mm_srli_epi16(p00, 4);
            p00 = _mm_packus_epi16(p00, p00);
            _mm_storel_epi64((__m128i*)&pfirst[0][i], p00);

            p00 = _mm_add_epi16(L5, L8);
            p10 = _mm_add_epi16(L6, L7);
            p10 = _mm_mullo_epi16(p10, coeff3);
            p00 = _mm_add_epi16(p00, coeff4);
            p00 = _mm_add_epi16(p00, p10);
            p00 = _mm_srli_epi16(p00, 3);
            p00 = _mm_packus_epi16(p00, p00);
            _mm_storel_epi64((__m128i*)&pfirst[1][i], p00);

            p00 = _mm_mullo_epi16(L8, coeff3);
            p10 = _mm_mullo_epi16(L9, coeff7);
            p20 = _mm_mullo_epi16(L10, coeff5);
            p30 = _mm_add_epi16(L11, coeff8);
            p00 = _mm_add_epi16(p00, p10);
            p00 = _mm_add_epi16(p00, p20);
            p00 = _mm_add_epi16(p00, p30);
            p00 = _mm_srli_epi16(p00, 4);
            p00 = _mm_packus_epi16(p00, p00);
            _mm_storel_epi64((__m128i*)&pfirst[2][i], p00);

            p00 = _mm_add_epi16(L11, L13);
            p10 = _mm_mullo_epi16(L12, coeff2);
            p00 = _mm_add_epi16(p00, coeff2);
            p00 = _mm_add_epi16(p00, p10);
            p00 = _mm_srli_epi16(p00, 2);
            p00 = _mm_packus_epi16(p00, p00);
            _mm_storel_epi64((__m128i*)&pfirst[3][i], p00);
        }

#if !BUGFIX_PREDICTION_INTRA
        // padding
        if (((real_size - 1) + 11) > iW2) {
            src = src_org + iW2;
            // no need to compute pad from *src: if a pad value is invalid it is
            // never read by the fill loops "for (i = start1; i < line_size; i += 16)"
            // below; otherwise it is valid.
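            // replicate the last valid sample of each of the four phase buffers
            // across the remainder of the line (reference samples beyond index
            // 2*bsx - 1 are not available in this pre-bugfix path)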
__m128i pad1 = _mm_set1_epi8(pfirst[0][iW2 - 2]); __m128i pad2 = _mm_set1_epi8(pfirst[1][iW2 - 5]); __m128i pad3 = _mm_set1_epi8(pfirst[2][iW2 - 8]); __m128i pad4 = _mm_set1_epi8(pfirst[3][iW2 - 11]); int start1 = iW2 - 1; int start2 = iW2 - 4; int start3 = iW2 - 7; int start4 = iW2 - 10; for (i = start1; i < line_size; i += 16) { _mm_storeu_si128((__m128i*)&pfirst[0][i], pad1); } for (i = start2; i < line_size; i += 16) { _mm_storeu_si128((__m128i*)&pfirst[1][i], pad2); } for (i = start3; i < line_size; i += 16) { _mm_storeu_si128((__m128i*)&pfirst[2][i], pad3); } for (i = start4; i < line_size; i += 16) { _mm_storeu_si128((__m128i*)&pfirst[3][i], pad4); } } #endif bsy >>= 2; for (i = 0; i < bsy; i++) { memcpy(dst1, pfirst[0] + i * 11, bsx * sizeof(pel_t)); memcpy(dst2, pfirst[1] + i * 11, bsx * sizeof(pel_t)); memcpy(dst3, pfirst[2] + i * 11, bsx * sizeof(pel_t)); memcpy(dst4, pfirst[3] + i * 11, bsx * sizeof(pel_t)); dst1 = dst4 + i_dst; dst2 = dst1 + i_dst; dst3 = dst2 + i_dst; dst4 = dst3 + i_dst; } } else if (bsx == 16) { __m128i p00, p10, p20, p30; __m128i p01, p11, p21, p31; __m128i SS2 = _mm_loadu_si128((__m128i*)(src + 2)); __m128i L2 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L3 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L4 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L5 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L6 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L7 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L8 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L9 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L10 = _mm_unpacklo_epi8(SS2, zero); __m128i H2 = L10; __m128i SS11 = _mm_loadu_si128((__m128i*)(src + 11)); __m128i L11 = _mm_unpacklo_epi8(SS11, zero); __m128i H3 = L11; SS11 = _mm_srli_si128(SS11, 1); __m128i L12 = _mm_unpacklo_epi8(SS11, zero); __m128i H4 = L12; SS11 = _mm_srli_si128(SS11, 1); __m128i L13 = _mm_unpacklo_epi8(SS11, zero); __m128i H5 = L13; SS11 = _mm_srli_si128(SS11, 1); __m128i H6 = _mm_unpacklo_epi8(SS11, zero); SS11 = _mm_srli_si128(SS11, 1); __m128i H7 = _mm_unpacklo_epi8(SS11, zero); SS11 = _mm_srli_si128(SS11, 1); __m128i H8 = _mm_unpacklo_epi8(SS11, zero); SS11 = _mm_srli_si128(SS11, 1); __m128i H9 = _mm_unpacklo_epi8(SS11, zero); SS11 = _mm_srli_si128(SS11, 1); __m128i H10 = _mm_unpacklo_epi8(SS11, zero); SS11 = _mm_srli_si128(SS11, 1); __m128i H11 = _mm_unpacklo_epi8(SS11, zero); __m128i SS20 = _mm_loadu_si128((__m128i*)(src + 20)); __m128i H12 = _mm_unpacklo_epi8(SS20, zero); SS20 = _mm_srli_si128(SS20, 1); __m128i H13 = _mm_unpacklo_epi8(SS20, zero); p00 = _mm_add_epi16(L2, coeff8); p10 = _mm_mullo_epi16(L3, coeff5); p20 = _mm_mullo_epi16(L4, coeff7); p30 = _mm_mullo_epi16(L5, coeff3); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 4); p01 = _mm_add_epi16(H2, coeff8); p11 = _mm_mullo_epi16(H3, coeff5); p21 = _mm_mullo_epi16(H4, coeff7); p31 = _mm_mullo_epi16(H5, coeff3); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 4); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)dst1, p00); p00 = _mm_add_epi16(L5, L8); p10 = _mm_add_epi16(L6, L7); p10 = _mm_mullo_epi16(p10, coeff3); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_add_epi16(p00, p10); p00 = _mm_srli_epi16(p00, 3); p01 = _mm_add_epi16(H5, H8); p11 
= _mm_add_epi16(H6, H7); p11 = _mm_mullo_epi16(p11, coeff3); p01 = _mm_add_epi16(p01, coeff4); p01 = _mm_add_epi16(p01, p11); p01 = _mm_srli_epi16(p01, 3); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)dst2, p00); p00 = _mm_mullo_epi16(L8, coeff3); p10 = _mm_mullo_epi16(L9, coeff7); p20 = _mm_mullo_epi16(L10, coeff5); p30 = _mm_add_epi16(L11, coeff8); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 4); p01 = _mm_mullo_epi16(H8, coeff3); p11 = _mm_mullo_epi16(H9, coeff7); p21 = _mm_mullo_epi16(H10, coeff5); p31 = _mm_add_epi16(H11, coeff8); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 4); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)dst3, p00); p00 = _mm_add_epi16(L11, L13); p10 = _mm_mullo_epi16(L12, coeff2); p00 = _mm_add_epi16(p00, coeff2); p00 = _mm_add_epi16(p00, p10); p00 = _mm_srli_epi16(p00, 2); p01 = _mm_add_epi16(H11, H13); p11 = _mm_mullo_epi16(H12, coeff2); p01 = _mm_add_epi16(p01, coeff2); p01 = _mm_add_epi16(p01, p11); p01 = _mm_srli_epi16(p01, 2); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)dst4, p00); } else if (bsx == 8) { __m128i p00, p10, p20, p30; __m128i SS2 = _mm_loadu_si128((__m128i*)(src + 2)); __m128i L2 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L3 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L4 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L5 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L6 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L7 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L8 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L9 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L10 = _mm_unpacklo_epi8(SS2, zero); __m128i SS11 = _mm_loadu_si128((__m128i*)(src + 11)); __m128i L11 = _mm_unpacklo_epi8(SS11, zero); SS11 = _mm_srli_si128(SS11, 1); __m128i L12 = _mm_unpacklo_epi8(SS11, zero); SS11 = _mm_srli_si128(SS11, 1); __m128i L13 = _mm_unpacklo_epi8(SS11, zero); p00 = _mm_add_epi16(L2, coeff8); p10 = _mm_mullo_epi16(L3, coeff5); p20 = _mm_mullo_epi16(L4, coeff7); p30 = _mm_mullo_epi16(L5, coeff3); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 4); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)dst1, p00); p00 = _mm_add_epi16(L5, L8); p10 = _mm_add_epi16(L6, L7); p10 = _mm_mullo_epi16(p10, coeff3); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_add_epi16(p00, p10); p00 = _mm_srli_epi16(p00, 3); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)dst2, p00); p00 = _mm_mullo_epi16(L8, coeff3); p10 = _mm_mullo_epi16(L9, coeff7); p20 = _mm_mullo_epi16(L10, coeff5); p30 = _mm_add_epi16(L11, coeff8); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 4); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)dst3, p00); p00 = _mm_add_epi16(L11, L13); p10 = _mm_mullo_epi16(L12, coeff2); p00 = _mm_add_epi16(p00, coeff2); p00 = _mm_add_epi16(p00, p10); p00 = _mm_srli_epi16(p00, 2); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)dst4, p00); #if BUGFIX_PREDICTION_INTRA __m128i pad1 = _mm_set1_epi8(src[16]); #else dst4[5] = dst4[4]; dst4[6] = dst4[4]; dst4[7] = dst4[4]; __m128i pad1 = _mm_set1_epi8((pel_t)((src[15] + 5 * 
src[16] + 7 * src[17] + 3 * src[18] + 8) >> 4)); __m128i pad2 = _mm_set1_epi8((pel_t)((src[15] + 3 * src[16] + 3 * src[17] + 1 * src[18] + 4) >> 3)); __m128i pad3 = _mm_set1_epi8(dst3[7]); __m128i pad4 = _mm_set1_epi8(dst4[4]); #endif dst1 = dst4 + i_dst; dst2 = dst1 + i_dst; dst3 = dst2 + i_dst; dst4 = dst3 + i_dst; #if BUGFIX_PREDICTION_INTRA _mm_storel_epi64((__m128i*)dst1, pad1); _mm_storel_epi64((__m128i*)dst2, pad1); _mm_storel_epi64((__m128i*)dst3, pad1); _mm_storel_epi64((__m128i*)dst4, pad1); #else _mm_storel_epi64((__m128i*)dst1, pad1); _mm_storel_epi64((__m128i*)dst2, pad2); _mm_storel_epi64((__m128i*)dst3, pad3); _mm_storel_epi64((__m128i*)dst4, pad4); #endif dst1[0] = (pel_t)((src[13] + 5 * src[14] + 7 * src[15] + 3 * src[16] + 8) >> 4); dst1[1] = (pel_t)((src[14] + 5 * src[15] + 7 * src[16] + 3 * src[17] + 8) >> 4); dst1[2] = (pel_t)((src[15] + 5 * src[16] + 7 * src[17] + 3 * src[18] + 8) >> 4); if (bsy == 32) { for (int i = 0; i < 6; i++) { dst1 = dst4 + i_dst; dst2 = dst1 + i_dst; dst3 = dst2 + i_dst; dst4 = dst3 + i_dst; #if BUGFIX_PREDICTION_INTRA _mm_storel_epi64((__m128i*)dst1, pad1); _mm_storel_epi64((__m128i*)dst2, pad1); _mm_storel_epi64((__m128i*)dst3, pad1); _mm_storel_epi64((__m128i*)dst4, pad1); #else _mm_storel_epi64((__m128i*)dst1, pad1); _mm_storel_epi64((__m128i*)dst2, pad2); _mm_storel_epi64((__m128i*)dst3, pad3); _mm_storel_epi64((__m128i*)dst4, pad4); #endif } } } else { if (bsy == 16) { __m128i p00, p10, p20, p30; __m128i SS2 = _mm_loadu_si128((__m128i*)(src + 2)); __m128i L2 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L3 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L4 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L5 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L6 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L7 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L8 = _mm_unpacklo_epi8(SS2, zero); p00 = _mm_add_epi16(L2, coeff8); p10 = _mm_mullo_epi16(L3, coeff5); p20 = _mm_mullo_epi16(L4, coeff7); p30 = _mm_mullo_epi16(L5, coeff3); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 4); p00 = _mm_packus_epi16(p00, p00); *((int*)(dst1)) = _mm_cvtsi128_si32(p00); p00 = _mm_add_epi16(L5, L8); p10 = _mm_add_epi16(L6, L7); p10 = _mm_mullo_epi16(p10, coeff3); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_add_epi16(p00, p10); p00 = _mm_srli_epi16(p00, 3); p00 = _mm_packus_epi16(p00, p00); *((int*)(dst2)) = _mm_cvtsi128_si32(p00); #if BUGFIX_PREDICTION_INTRA __m128i pad1 = _mm_set1_epi8(src[8]); *((int*)(dst3)) = _mm_cvtsi128_si32(pad1); *((int*)(dst4)) = _mm_cvtsi128_si32(pad1); #else dst2[3] = dst2[2]; __m128i pad1 = _mm_set1_epi8((pel_t)((src[7] + 5 * src[8] + 7 * src[9] + 3 * src[10] + 8) >> 4)); __m128i pad2 = _mm_set1_epi8(dst2[2]); __m128i pad3 = _mm_set1_epi8((pel_t)((3 * src[7] + 7 * src[8] + 5 * src[9] + src[10] + 8) >> 4)); __m128i pad4 = _mm_set1_epi8((pel_t)((src[7] + 2 * src[8] + src[9] + 2) >> 2)); *((int*)(dst3)) = _mm_cvtsi128_si32(pad3); *((int*)(dst4)) = _mm_cvtsi128_si32(pad4); #endif for (int i = 0; i < 3; i++) { dst1 = dst4 + i_dst; dst2 = dst1 + i_dst; dst3 = dst2 + i_dst; dst4 = dst3 + i_dst; #if BUGFIX_PREDICTION_INTRA *((int*)(dst1)) = _mm_cvtsi128_si32(pad1); *((int*)(dst2)) = _mm_cvtsi128_si32(pad1); *((int*)(dst3)) = _mm_cvtsi128_si32(pad1); *((int*)(dst4)) = _mm_cvtsi128_si32(pad1); #else *((int*)(dst1)) = 
_mm_cvtsi128_si32(pad1); *((int*)(dst2)) = _mm_cvtsi128_si32(pad2); *((int*)(dst3)) = _mm_cvtsi128_si32(pad3); *((int*)(dst4)) = _mm_cvtsi128_si32(pad4); #endif } } else { __m128i p00, p10, p20, p30; __m128i SS2 = _mm_loadu_si128((__m128i*)(src + 2)); __m128i L2 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L3 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L4 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L5 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L6 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L7 = _mm_unpacklo_epi8(SS2, zero); SS2 = _mm_srli_si128(SS2, 1); __m128i L8 = _mm_unpacklo_epi8(SS2, zero); p00 = _mm_add_epi16(L2, coeff8); p10 = _mm_mullo_epi16(L3, coeff5); p20 = _mm_mullo_epi16(L4, coeff7); p30 = _mm_mullo_epi16(L5, coeff3); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 4); p00 = _mm_packus_epi16(p00, p00); *((int*)(dst1)) = _mm_cvtsi128_si32(p00); p00 = _mm_add_epi16(L5, L8); p10 = _mm_add_epi16(L6, L7); p10 = _mm_mullo_epi16(p10, coeff3); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_add_epi16(p00, p10); p00 = _mm_srli_epi16(p00, 3); p00 = _mm_packus_epi16(p00, p00); *((int*)(dst2)) = _mm_cvtsi128_si32(p00); #if BUGFIX_PREDICTION_INTRA __m128i pad1 = _mm_set1_epi8(src[8]); *((int*)(dst3)) = _mm_cvtsi128_si32(pad1); *((int*)(dst4)) = _mm_cvtsi128_si32(pad1); #else dst2[3] = dst2[2]; dst3[0] = (pel_t)((3 * src[7] + 7 * src[8] + 5 * src[9] + src[10] + 8) >> 4); dst3[1] = dst3[0]; dst3[2] = dst3[0]; dst3[3] = dst3[0]; dst4[0] = (pel_t)((src[7] + 2 * src[8] + src[9] + 2) >> 2); dst4[1] = dst4[0]; dst4[2] = dst4[0]; dst4[3] = dst4[0]; #endif } } } /* --------------------------------------------------------------------------- */ void intra_pred_ang_x_4_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { ALIGN16(pel_t first_line[64 + 128]); int line_size = bsx + ((bsy - 1) << 1); #if !BUGFIX_PREDICTION_INTRA int real_size = DAVS2_MIN(line_size, bsx * 2 - 2); #endif int iHeight2 = bsy << 1; int i; __m128i zero = _mm_setzero_si128(); __m128i offset = _mm_set1_epi16(2); UNUSED_PARAMETER(dir_mode); src += 3; #if BUGFIX_PREDICTION_INTRA for (i = 0; i < line_size - 8; i += 16, src += 16) { #else for (i = 0; i < real_size - 8; i += 16, src += 16) { #endif __m128i S0 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i S1 = _mm_loadu_si128((__m128i*)(src)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i H0 = _mm_unpackhi_epi8(S0, zero); __m128i H1 = _mm_unpackhi_epi8(S1, zero); __m128i H2 = _mm_unpackhi_epi8(S2, zero); __m128i sum1 = _mm_add_epi16(L0, L1); __m128i sum2 = _mm_add_epi16(L1, L2); __m128i sum3 = _mm_add_epi16(H0, H1); __m128i sum4 = _mm_add_epi16(H1, H2); sum1 = _mm_add_epi16(sum1, sum2); sum3 = _mm_add_epi16(sum3, sum4); sum1 = _mm_add_epi16(sum1, offset); sum3 = _mm_add_epi16(sum3, offset); sum1 = _mm_srli_epi16(sum1, 2); sum3 = _mm_srli_epi16(sum3, 2); sum1 = _mm_packus_epi16(sum1, sum3); _mm_store_si128((__m128i*)&first_line[i], sum1); } #if BUGFIX_PREDICTION_INTRA if (i < line_size) { #else if (i < real_size) { #endif __m128i S0 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i S1 = _mm_loadu_si128((__m128i*)(src)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = 
_mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i sum1 = _mm_add_epi16(L0, L1); __m128i sum2 = _mm_add_epi16(L1, L2); sum1 = _mm_add_epi16(sum1, sum2); sum1 = _mm_add_epi16(sum1, offset); sum1 = _mm_srli_epi16(sum1, 2); sum1 = _mm_packus_epi16(sum1, sum1); _mm_storel_epi64((__m128i*)&first_line[i], sum1); } #if !BUGFIX_PREDICTION_INTRA // padding for (i = real_size; i < line_size; i += 16) { __m128i pad = _mm_set1_epi8(first_line[real_size - 1]); _mm_storeu_si128((__m128i*)&first_line[i], pad); } #endif if (bsx == bsy || bsx > 16) { for (i = 0; i < iHeight2; i += 2) { memcpy(dst, first_line + i, bsx * sizeof(pel_t)); dst += i_dst; } } else if (bsx == 16) { pel_t *dst1 = dst; __m128i M = _mm_loadu_si128((__m128i*)&first_line[0]); _mm_storel_epi64((__m128i*)dst, M); dst += i_dst; M = _mm_srli_si128(M, 2); _mm_storel_epi64((__m128i*)dst, M); dst += i_dst; M = _mm_srli_si128(M, 2); _mm_storel_epi64((__m128i*)dst, M); dst += i_dst; M = _mm_srli_si128(M, 2); _mm_storel_epi64((__m128i*)dst, M); dst = dst1 + 8; M = _mm_loadu_si128((__m128i*)&first_line[8]); _mm_storel_epi64((__m128i*)dst, M); dst += i_dst; M = _mm_srli_si128(M, 2); _mm_storel_epi64((__m128i*)dst, M); dst += i_dst; M = _mm_srli_si128(M, 2); _mm_storel_epi64((__m128i*)dst, M); dst += i_dst; M = _mm_srli_si128(M, 2); _mm_storel_epi64((__m128i*)dst, M); } else if (bsx == 8) { for (i = 0; i < iHeight2; i += 8) { __m128i M = _mm_loadu_si128((__m128i*)&first_line[i]); _mm_storel_epi64((__m128i*)dst, M); dst += i_dst; M = _mm_srli_si128(M, 2); _mm_storel_epi64((__m128i*)dst, M); dst += i_dst; M = _mm_srli_si128(M, 2); _mm_storel_epi64((__m128i*)dst, M); dst += i_dst; M = _mm_srli_si128(M, 2); _mm_storel_epi64((__m128i*)dst, M); dst += i_dst; } } else { for (i = 0; i < iHeight2; i += 8) { __m128i M = _mm_loadu_si128((__m128i*)&first_line[i]); ((int*)(dst))[0] = _mm_cvtsi128_si32(M); dst += i_dst; M = _mm_srli_si128(M, 2); ((int*)(dst))[0] = _mm_cvtsi128_si32(M); dst += i_dst; M = _mm_srli_si128(M, 2); ((int*)(dst))[0] = _mm_cvtsi128_si32(M); dst += i_dst; M = _mm_srli_si128(M, 2); ((int*)(dst))[0] = _mm_cvtsi128_si32(M); dst += i_dst; } } } /* --------------------------------------------------------------------------- */ void intra_pred_ang_x_5_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { __m128i zero = _mm_setzero_si128(); __m128i coeff2 = _mm_set1_epi16(2); __m128i coeff3 = _mm_set1_epi16(3); __m128i coeff4 = _mm_set1_epi16(4); __m128i coeff5 = _mm_set1_epi16(5); __m128i coeff7 = _mm_set1_epi16(7); __m128i coeff8 = _mm_set1_epi16(8); __m128i coeff9 = _mm_set1_epi16(9); __m128i coeff11 = _mm_set1_epi16(11); __m128i coeff13 = _mm_set1_epi16(13); __m128i coeff15 = _mm_set1_epi16(15); __m128i coeff16 = _mm_set1_epi16(16); UNUSED_PARAMETER(dir_mode); int i; if (((bsy > 4) && (bsx > 8))) { ALIGN16(pel_t first_line[(64 + 80 + 16) << 3]); int line_size = bsx + ((bsy - 8) >> 3) * 11; #if !BUGFIX_PREDICTION_INTRA int iW2 = bsx * 2 - 1; int real_size = DAVS2_MIN(line_size, iW2 + 1); #endif int aligned_line_size = (((line_size + 15) >> 4) << 4) + 16; pel_t *pfirst[8]; #if !BUGFIX_PREDICTION_INTRA pel_t *src_org = src; #endif pel_t *dst1 = dst; pel_t *dst2 = dst1 + i_dst; pel_t *dst3 = dst2 + i_dst; pel_t *dst4 = dst3 + i_dst; pel_t *dst5 = dst4 + i_dst; pel_t *dst6 = dst5 + i_dst; pel_t *dst7 = dst6 + i_dst; pel_t *dst8 = dst7 + i_dst; pfirst[0] = first_line; pfirst[1] = pfirst[0] + aligned_line_size; pfirst[2] = pfirst[1] + aligned_line_size; pfirst[3] = pfirst[2] + aligned_line_size; 
pfirst[4] = pfirst[3] + aligned_line_size; pfirst[5] = pfirst[4] + aligned_line_size; pfirst[6] = pfirst[5] + aligned_line_size; pfirst[7] = pfirst[6] + aligned_line_size; __m128i p00, p10, p20, p30; __m128i p01, p11, p21, p31; #if BUGFIX_PREDICTION_INTRA for (i = 0; i < line_size - 8; i += 16, src += 16) { #else for (i = 0; i < real_size - 8; i += 16, src += 16) { #endif __m128i SS1 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i L1 = _mm_unpacklo_epi8(SS1, zero); SS1 = _mm_srli_si128(SS1, 1); __m128i L2 = _mm_unpacklo_epi8(SS1, zero); SS1 = _mm_srli_si128(SS1, 1); __m128i L3 = _mm_unpacklo_epi8(SS1, zero); SS1 = _mm_srli_si128(SS1, 1); __m128i L4 = _mm_unpacklo_epi8(SS1, zero); SS1 = _mm_srli_si128(SS1, 1); __m128i L5 = _mm_unpacklo_epi8(SS1, zero); SS1 = _mm_srli_si128(SS1, 1); __m128i L6 = _mm_unpacklo_epi8(SS1, zero); SS1 = _mm_srli_si128(SS1, 1); __m128i L7 = _mm_unpacklo_epi8(SS1, zero); SS1 = _mm_srli_si128(SS1, 1); __m128i L8 = _mm_unpacklo_epi8(SS1, zero); SS1 = _mm_srli_si128(SS1, 1); __m128i L9 = _mm_unpacklo_epi8(SS1, zero); __m128i H1 = L9; __m128i SS10 = _mm_loadu_si128((__m128i*)(src + 10)); __m128i L10 = _mm_unpacklo_epi8(SS10, zero); __m128i H2 = L10; SS10 = _mm_srli_si128(SS10, 1); __m128i L11 = _mm_unpacklo_epi8(SS10, zero); __m128i H3 = L11; SS10 = _mm_srli_si128(SS10, 1); __m128i L12 = _mm_unpacklo_epi8(SS10, zero); __m128i H4 = L12; SS10 = _mm_srli_si128(SS10, 1); __m128i L13 = _mm_unpacklo_epi8(SS10, zero); __m128i H5 = L13; SS10 = _mm_srli_si128(SS10, 1); __m128i H6 = _mm_unpacklo_epi8(SS10, zero); SS10 = _mm_srli_si128(SS10, 1); __m128i H7 = _mm_unpacklo_epi8(SS10, zero); SS10 = _mm_srli_si128(SS10, 1); __m128i H8 = _mm_unpacklo_epi8(SS10, zero); SS10 = _mm_srli_si128(SS10, 1); __m128i H9 = _mm_unpacklo_epi8(SS10, zero); SS10 = _mm_srli_si128(SS10, 1); __m128i H10 = _mm_unpacklo_epi8(SS10, zero); __m128i SS19 = _mm_loadu_si128((__m128i*)(src + 19)); __m128i H11 = _mm_unpacklo_epi8(SS19, zero); SS19 = _mm_srli_si128(SS19, 1); __m128i H12 = _mm_unpacklo_epi8(SS19, zero); SS19 = _mm_srli_si128(SS19, 1); __m128i H13 = _mm_unpacklo_epi8(SS19, zero); p00 = _mm_mullo_epi16(L1, coeff5); p10 = _mm_mullo_epi16(L2, coeff13); p20 = _mm_mullo_epi16(L3, coeff11); p30 = _mm_mullo_epi16(L4, coeff3); p00 = _mm_add_epi16(p00, coeff16); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 5); p01 = _mm_mullo_epi16(H1, coeff5); p11 = _mm_mullo_epi16(H2, coeff13); p21 = _mm_mullo_epi16(H3, coeff11); p31 = _mm_mullo_epi16(H4, coeff3); p01 = _mm_add_epi16(p01, coeff16); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 5); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)&pfirst[0][i], p00); p10 = _mm_mullo_epi16(L3, coeff5); p20 = _mm_mullo_epi16(L4, coeff7); p30 = _mm_mullo_epi16(L5, coeff3); p00 = _mm_add_epi16(L2, coeff8); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 4); p11 = _mm_mullo_epi16(H3, coeff5); p21 = _mm_mullo_epi16(H4, coeff7); p31 = _mm_mullo_epi16(H5, coeff3); p01 = _mm_add_epi16(H2, coeff8); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 4); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)&pfirst[1][i], p00); p00 = _mm_mullo_epi16(L4, coeff7); p10 = _mm_mullo_epi16(L5, coeff15); p20 = _mm_mullo_epi16(L6, coeff9); p30 = _mm_add_epi16(L7, coeff16); p00 = 
_mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 5); p01 = _mm_mullo_epi16(H4, coeff7); p11 = _mm_mullo_epi16(H5, coeff15); p21 = _mm_mullo_epi16(H6, coeff9); p31 = _mm_add_epi16(H7, coeff16); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 5); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)&pfirst[2][i], p00); p00 = _mm_add_epi16(L5, L8); p10 = _mm_add_epi16(L6, L7); p10 = _mm_mullo_epi16(p10, coeff3); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_add_epi16(p00, p10); p00 = _mm_srli_epi16(p00, 3); p01 = _mm_add_epi16(H5, H8); p11 = _mm_add_epi16(H6, H7); p11 = _mm_mullo_epi16(p11, coeff3); p01 = _mm_add_epi16(p01, coeff4); p01 = _mm_add_epi16(p01, p11); p01 = _mm_srli_epi16(p01, 3); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)&pfirst[3][i], p00); p00 = _mm_add_epi16(L6, coeff16); p10 = _mm_mullo_epi16(L7, coeff9); p20 = _mm_mullo_epi16(L8, coeff15); p30 = _mm_mullo_epi16(L9, coeff7); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 5); p01 = _mm_add_epi16(H6, coeff16); p11 = _mm_mullo_epi16(H7, coeff9); p21 = _mm_mullo_epi16(H8, coeff15); p31 = _mm_mullo_epi16(H9, coeff7); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 5); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)&pfirst[4][i], p00); p00 = _mm_mullo_epi16(L8, coeff3); p10 = _mm_mullo_epi16(L9, coeff7); p20 = _mm_mullo_epi16(L10, coeff5); p30 = _mm_add_epi16(L11, coeff8); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 4); p01 = _mm_mullo_epi16(H8, coeff3); p11 = _mm_mullo_epi16(H9, coeff7); p21 = _mm_mullo_epi16(H10, coeff5); p31 = _mm_add_epi16(H11, coeff8); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 4); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)&pfirst[5][i], p00); p00 = _mm_mullo_epi16(L9, coeff3); p10 = _mm_mullo_epi16(L10, coeff11); p20 = _mm_mullo_epi16(L11, coeff13); p30 = _mm_mullo_epi16(L12, coeff5); p00 = _mm_add_epi16(p00, coeff16); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 5); p01 = _mm_mullo_epi16(H9, coeff3); p11 = _mm_mullo_epi16(H10, coeff11); p21 = _mm_mullo_epi16(H11, coeff13); p31 = _mm_mullo_epi16(H12, coeff5); p01 = _mm_add_epi16(p01, coeff16); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 5); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)&pfirst[6][i], p00); p00 = _mm_add_epi16(L11, L13); p10 = _mm_add_epi16(L12, L12); p00 = _mm_add_epi16(p00, coeff2); p00 = _mm_add_epi16(p00, p10); p00 = _mm_srli_epi16(p00, 2); p01 = _mm_add_epi16(H11, H13); p11 = _mm_add_epi16(H12, H12); p01 = _mm_add_epi16(p01, coeff2); p01 = _mm_add_epi16(p01, p11); p01 = _mm_srli_epi16(p01, 2); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)&pfirst[7][i], p00); } #if BUGFIX_PREDICTION_INTRA if (i < line_size) { #else if (i < real_size) { #endif __m128i SS1 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i L1 = _mm_unpacklo_epi8(SS1, zero); SS1 = _mm_srli_si128(SS1, 1); __m128i L2 = _mm_unpacklo_epi8(SS1, zero); SS1 = _mm_srli_si128(SS1, 1); __m128i L3 = _mm_unpacklo_epi8(SS1, zero); SS1 = 
_mm_srli_si128(SS1, 1); __m128i L4 = _mm_unpacklo_epi8(SS1, zero); SS1 = _mm_srli_si128(SS1, 1); __m128i L5 = _mm_unpacklo_epi8(SS1, zero); SS1 = _mm_srli_si128(SS1, 1); __m128i L6 = _mm_unpacklo_epi8(SS1, zero); SS1 = _mm_srli_si128(SS1, 1); __m128i L7 = _mm_unpacklo_epi8(SS1, zero); SS1 = _mm_srli_si128(SS1, 1); __m128i L8 = _mm_unpacklo_epi8(SS1, zero); SS1 = _mm_srli_si128(SS1, 1); __m128i L9 = _mm_unpacklo_epi8(SS1, zero); __m128i SS10 = _mm_loadu_si128((__m128i*)(src + 10)); __m128i L10 = _mm_unpacklo_epi8(SS10, zero); SS10 = _mm_srli_si128(SS10, 1); __m128i L11 = _mm_unpacklo_epi8(SS10, zero); SS10 = _mm_srli_si128(SS10, 1); __m128i L12 = _mm_unpacklo_epi8(SS10, zero); SS10 = _mm_srli_si128(SS10, 1); __m128i L13 = _mm_unpacklo_epi8(SS10, zero); p00 = _mm_mullo_epi16(L1, coeff5); p10 = _mm_mullo_epi16(L2, coeff13); p20 = _mm_mullo_epi16(L3, coeff11); p30 = _mm_mullo_epi16(L4, coeff3); p00 = _mm_add_epi16(p00, coeff16); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)&pfirst[0][i], p00); p10 = _mm_mullo_epi16(L3, coeff5); p20 = _mm_mullo_epi16(L4, coeff7); p30 = _mm_mullo_epi16(L5, coeff3); p00 = _mm_add_epi16(L2, coeff8); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 4); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)&pfirst[1][i], p00); p00 = _mm_mullo_epi16(L4, coeff7); p10 = _mm_mullo_epi16(L5, coeff15); p20 = _mm_mullo_epi16(L6, coeff9); p30 = _mm_add_epi16(L7, coeff16); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)&pfirst[2][i], p00); p00 = _mm_add_epi16(L5, L8); p10 = _mm_add_epi16(L6, L7); p10 = _mm_mullo_epi16(p10, coeff3); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_add_epi16(p00, p10); p00 = _mm_srli_epi16(p00, 3); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)&pfirst[3][i], p00); p00 = _mm_add_epi16(L6, coeff16); p10 = _mm_mullo_epi16(L7, coeff9); p20 = _mm_mullo_epi16(L8, coeff15); p30 = _mm_mullo_epi16(L9, coeff7); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)&pfirst[4][i], p00); p00 = _mm_mullo_epi16(L8, coeff3); p10 = _mm_mullo_epi16(L9, coeff7); p20 = _mm_mullo_epi16(L10, coeff5); p30 = _mm_add_epi16(L11, coeff8); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 4); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)&pfirst[5][i], p00); p00 = _mm_mullo_epi16(L9, coeff3); p10 = _mm_mullo_epi16(L10, coeff11); p20 = _mm_mullo_epi16(L11, coeff13); p30 = _mm_mullo_epi16(L12, coeff5); p00 = _mm_add_epi16(p00, coeff16); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)&pfirst[6][i], p00); p00 = _mm_add_epi16(L11, L13); p10 = _mm_add_epi16(L12, L12); p00 = _mm_add_epi16(p00, coeff2); p00 = _mm_add_epi16(p00, p10); p00 = _mm_srli_epi16(p00, 2); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)&pfirst[7][i], p00); } #if !BUGFIX_PREDICTION_INTRA //padding if (real_size + 10 > iW2) { src = src_org + iW2; //needn't calculate pad using the value src. 
            // If a pad value is invalid here it is never read by the fill loops
            // "for (i = start1; i < line_size; i += 16)" below; otherwise it is valid.
            __m128i pad1 = _mm_set1_epi8(pfirst[0][iW2 - 1]);
            __m128i pad2 = _mm_set1_epi8(pfirst[1][iW2 - 2]);
            __m128i pad3 = _mm_set1_epi8(pfirst[2][iW2 - 4]);
            __m128i pad4 = _mm_set1_epi8(pfirst[3][iW2 - 5]);
            __m128i pad5 = _mm_set1_epi8(pfirst[4][iW2 - 6]);
            __m128i pad6 = _mm_set1_epi8(pfirst[5][iW2 - 8]);
            __m128i pad7 = _mm_set1_epi8(pfirst[6][iW2 - 9]);
            __m128i pad8 = _mm_set1_epi8(pfirst[7][iW2 - 11]);
            int start1 = iW2;
            int start2 = iW2 - 1;
            int start3 = iW2 - 3;
            int start4 = iW2 - 4;
            int start5 = iW2 - 5;
            int start6 = iW2 - 7;
            int start7 = iW2 - 8;
            int start8 = iW2 - 10;
            for (i = start1; i < line_size; i += 16) {
                _mm_storeu_si128((__m128i*)&pfirst[0][i], pad1);
            }
            for (i = start2; i < line_size; i += 16) {
                _mm_storeu_si128((__m128i*)&pfirst[1][i], pad2);
            }
            for (i = start3; i < line_size; i += 16) {
                _mm_storeu_si128((__m128i*)&pfirst[2][i], pad3);
            }
            for (i = start4; i < line_size; i += 16) {
                _mm_storeu_si128((__m128i*)&pfirst[3][i], pad4);
            }
            for (i = start5; i < line_size; i += 16) {
                _mm_storeu_si128((__m128i*)&pfirst[4][i], pad5);
            }
            for (i = start6; i < line_size; i += 16) {
                _mm_storeu_si128((__m128i*)&pfirst[5][i], pad6);
            }
            for (i = start7; i < line_size; i += 16) {
                _mm_storeu_si128((__m128i*)&pfirst[6][i], pad7);
            }
            for (i = start8; i < line_size; i += 16) {
                _mm_storeu_si128((__m128i*)&pfirst[7][i], pad8);
            }
        }
#endif

        bsy >>= 3;
        for (i = 0; i < bsy; i++) {
            memcpy(dst1, pfirst[0] + i * 11, bsx * sizeof(pel_t));
            memcpy(dst2, pfirst[1] + i * 11, bsx * sizeof(pel_t));
            memcpy(dst3, pfirst[2] + i * 11, bsx * sizeof(pel_t));
            memcpy(dst4, pfirst[3] + i * 11, bsx * sizeof(pel_t));
            memcpy(dst5, pfirst[4] + i * 11, bsx * sizeof(pel_t));
            memcpy(dst6, pfirst[5] + i * 11, bsx * sizeof(pel_t));
            memcpy(dst7, pfirst[6] + i * 11, bsx * sizeof(pel_t));
            memcpy(dst8, pfirst[7] + i * 11, bsx * sizeof(pel_t));
            dst1 = dst8 + i_dst;
            dst2 = dst1 + i_dst;
            dst3 = dst2 + i_dst;
            dst4 = dst3 + i_dst;
            dst5 = dst4 + i_dst;
            dst6 = dst5 + i_dst;
            dst7 = dst6 + i_dst;
            dst8 = dst7 + i_dst;
        }
    } else if (bsx == 16) {
        pel_t *dst1 = dst;
        pel_t *dst2 = dst1 + i_dst;
        pel_t *dst3 = dst2 + i_dst;
        pel_t *dst4 = dst3 + i_dst;
        __m128i p00, p10, p20, p30;
        __m128i p01, p11, p21, p31;
        __m128i SS1 = _mm_loadu_si128((__m128i*)(src + 1));
        __m128i L1 = _mm_unpacklo_epi8(SS1, zero);  SS1 = _mm_srli_si128(SS1, 1);
        __m128i L2 = _mm_unpacklo_epi8(SS1, zero);  SS1 = _mm_srli_si128(SS1, 1);
        __m128i L3 = _mm_unpacklo_epi8(SS1, zero);  SS1 = _mm_srli_si128(SS1, 1);
        __m128i L4 = _mm_unpacklo_epi8(SS1, zero);  SS1 = _mm_srli_si128(SS1, 1);
        __m128i L5 = _mm_unpacklo_epi8(SS1, zero);  SS1 = _mm_srli_si128(SS1, 1);
        __m128i L6 = _mm_unpacklo_epi8(SS1, zero);  SS1 = _mm_srli_si128(SS1, 1);
        __m128i L7 = _mm_unpacklo_epi8(SS1, zero);  SS1 = _mm_srli_si128(SS1, 1);
        __m128i L8 = _mm_unpacklo_epi8(SS1, zero);  SS1 = _mm_srli_si128(SS1, 1);
        __m128i H1 = _mm_unpacklo_epi8(SS1, zero);
        __m128i SS10 = _mm_loadu_si128((__m128i*)(src + 10));
        __m128i H2 = _mm_unpacklo_epi8(SS10, zero);  SS10 = _mm_srli_si128(SS10, 1);
        __m128i H3 = _mm_unpacklo_epi8(SS10, zero);  SS10 = _mm_srli_si128(SS10, 1);
        __m128i H4 = _mm_unpacklo_epi8(SS10, zero);  SS10 = _mm_srli_si128(SS10, 1);
        __m128i H5 = _mm_unpacklo_epi8(SS10, zero);  SS10 = _mm_srli_si128(SS10, 1);
        __m128i H6 = _mm_unpacklo_epi8(SS10, zero);  SS10 = _mm_srli_si128(SS10, 1);
        __m128i H7 = _mm_unpacklo_epi8(SS10, zero);  SS10 = _mm_srli_si128(SS10, 1);
        __m128i H8 = _mm_unpacklo_epi8(SS10, zero);
        p00 = _mm_mullo_epi16(L1, coeff5);
        p10 =
_mm_mullo_epi16(L2, coeff13); p20 = _mm_mullo_epi16(L3, coeff11); p30 = _mm_mullo_epi16(L4, coeff3); p00 = _mm_add_epi16(p00, coeff16); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 5); p01 = _mm_mullo_epi16(H1, coeff5); p11 = _mm_mullo_epi16(H2, coeff13); p21 = _mm_mullo_epi16(H3, coeff11); p31 = _mm_mullo_epi16(H4, coeff3); p01 = _mm_add_epi16(p01, coeff16); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 5); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)dst1, p00); p10 = _mm_mullo_epi16(L3, coeff5); p20 = _mm_mullo_epi16(L4, coeff7); p30 = _mm_mullo_epi16(L5, coeff3); p00 = _mm_add_epi16(L2, coeff8); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 4); p11 = _mm_mullo_epi16(H3, coeff5); p21 = _mm_mullo_epi16(H4, coeff7); p31 = _mm_mullo_epi16(H5, coeff3); p01 = _mm_add_epi16(H2, coeff8); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 4); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)dst2, p00); p00 = _mm_mullo_epi16(L4, coeff7); p10 = _mm_mullo_epi16(L5, coeff15); p20 = _mm_mullo_epi16(L6, coeff9); p30 = _mm_add_epi16(L7, coeff16); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 5); p01 = _mm_mullo_epi16(H4, coeff7); p11 = _mm_mullo_epi16(H5, coeff15); p21 = _mm_mullo_epi16(H6, coeff9); p31 = _mm_add_epi16(H7, coeff16); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 5); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)dst3, p00); p00 = _mm_add_epi16(L5, L8); p10 = _mm_add_epi16(L6, L7); p10 = _mm_mullo_epi16(p10, coeff3); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_add_epi16(p00, p10); p00 = _mm_srli_epi16(p00, 3); p01 = _mm_add_epi16(H5, H8); p11 = _mm_add_epi16(H6, H7); p11 = _mm_mullo_epi16(p11, coeff3); p01 = _mm_add_epi16(p01, coeff4); p01 = _mm_add_epi16(p01, p11); p01 = _mm_srli_epi16(p01, 3); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)dst4, p00); } else if (bsx == 8) { pel_t *dst1 = dst; pel_t *dst2 = dst1 + i_dst; pel_t *dst3 = dst2 + i_dst; pel_t *dst4 = dst3 + i_dst; pel_t *dst5 = dst4 + i_dst; pel_t *dst6 = dst5 + i_dst; pel_t *dst7 = dst6 + i_dst; pel_t *dst8 = dst7 + i_dst; for (i = 0; i < 8; src++, i++) { dst1[i] = (pel_t)((5 * src[1] + 13 * src[2] + 11 * src[3] + 3 * src[4] + 16) >> 5); dst2[i] = (pel_t)((src[2] + 5 * src[3] + 7 * src[4] + 3 * src[5] + 8) >> 4); dst3[i] = (pel_t)((7 * src[4] + 15 * src[5] + 9 * src[6] + 1 * src[7] + 16) >> 5); dst4[i] = (pel_t)((src[5] + 3 * src[6] + 3 * src[7] + 1 * src[8] + 4) >> 3); dst5[i] = (pel_t)((src[6] + 9 * src[7] + 15 * src[8] + 7 * src[9] + 16) >> 5); dst6[i] = (pel_t)((3 * src[8] + 7 * src[9] + 5 * src[10] + src[11] + 8) >> 4); dst7[i] = (pel_t)((3 * src[9] + 11 * src[10] + 13 * src[11] + 5 * src[12] + 16) >> 5); dst8[i] = (pel_t)((src[11] + 2 * src[12] + src[13] + 2) >> 2); } #if !BUGFIX_PREDICTION_INTRA dst7[7] = dst7[6]; dst8[7] = dst8[4]; dst8[6] = dst8[4]; dst8[5] = dst8[4]; #endif if (bsy == 32) { //src -> 8,src[7] -> 15 #if BUGFIX_PREDICTION_INTRA __m128i pad1 = _mm_set1_epi8(src[8]); #else __m128i pad1 = _mm_set1_epi8((pel_t)((5 * src[7] + 13 * src[8] + 11 * src[9] + 3 * src[10] + 16) >> 5)); __m128i pad2 = _mm_set1_epi8((pel_t)((src[7] + 5 
* src[8] + 7 * src[9] + 3 * src[10] + 8) >> 4)); __m128i pad3 = _mm_set1_epi8((pel_t)((7 * src[7] + 15 * src[8] + 9 * src[9] + 1 * src[10] + 16) >> 5)); __m128i pad4 = _mm_set1_epi8((pel_t)((src[7] + 3 * src[8] + 3 * src[9] + 1 * src[10] + 4) >> 3)); __m128i pad5 = _mm_set1_epi8((pel_t)((src[7] + 9 * src[8] + 15 * src[9] + 7 * src[10] + 16) >> 5)); __m128i pad6 = _mm_set1_epi8(dst6[7]); __m128i pad7 = _mm_set1_epi8(dst7[7]); __m128i pad8 = _mm_set1_epi8(dst8[7]); #endif dst1 = dst8 + i_dst; dst2 = dst1 + i_dst; dst3 = dst2 + i_dst; dst4 = dst3 + i_dst; dst5 = dst4 + i_dst; dst6 = dst5 + i_dst; dst7 = dst6 + i_dst; dst8 = dst7 + i_dst; #if BUGFIX_PREDICTION_INTRA _mm_storel_epi64((__m128i*)dst1, pad1); _mm_storel_epi64((__m128i*)dst2, pad1); _mm_storel_epi64((__m128i*)dst3, pad1); _mm_storel_epi64((__m128i*)dst4, pad1); _mm_storel_epi64((__m128i*)dst5, pad1); _mm_storel_epi64((__m128i*)dst6, pad1); _mm_storel_epi64((__m128i*)dst7, pad1); _mm_storel_epi64((__m128i*)dst8, pad1); #else _mm_storel_epi64((__m128i*)dst1, pad1); _mm_storel_epi64((__m128i*)dst2, pad2); _mm_storel_epi64((__m128i*)dst3, pad3); _mm_storel_epi64((__m128i*)dst4, pad4); _mm_storel_epi64((__m128i*)dst5, pad5); _mm_storel_epi64((__m128i*)dst6, pad6); _mm_storel_epi64((__m128i*)dst7, pad7); _mm_storel_epi64((__m128i*)dst8, pad8); #endif src += 4; dst1[0] = (pel_t)((5 * src[0] + 13 * src[1] + 11 * src[2] + 3 * src[3] + 16) >> 5); dst1[1] = (pel_t)((5 * src[1] + 13 * src[2] + 11 * src[3] + 3 * src[4] + 16) >> 5); dst1[2] = (pel_t)((5 * src[2] + 13 * src[3] + 11 * src[4] + 3 * src[5] + 16) >> 5); dst1[3] = (pel_t)((5 * src[3] + 13 * src[4] + 11 * src[5] + 3 * src[6] + 16) >> 5); dst2[0] = (pel_t)((src[1] + 5 * src[2] + 7 * src[3] + 3 * src[4] + 8) >> 4); dst2[1] = (pel_t)((src[2] + 5 * src[3] + 7 * src[4] + 3 * src[5] + 8) >> 4); dst2[2] = (pel_t)((src[3] + 5 * src[4] + 7 * src[5] + 3 * src[6] + 8) >> 4); dst3[0] = (pel_t)((7 * src[3] + 15 * src[4] + 9 * src[5] + src[6] + 16) >> 5); dst1 = dst8 + i_dst; dst2 = dst1 + i_dst; dst3 = dst2 + i_dst; dst4 = dst3 + i_dst; dst5 = dst4 + i_dst; dst6 = dst5 + i_dst; dst7 = dst6 + i_dst; dst8 = dst7 + i_dst; #if BUGFIX_PREDICTION_INTRA _mm_storel_epi64((__m128i*)dst1, pad1); _mm_storel_epi64((__m128i*)dst2, pad1); _mm_storel_epi64((__m128i*)dst3, pad1); _mm_storel_epi64((__m128i*)dst4, pad1); _mm_storel_epi64((__m128i*)dst5, pad1); _mm_storel_epi64((__m128i*)dst6, pad1); _mm_storel_epi64((__m128i*)dst7, pad1); _mm_storel_epi64((__m128i*)dst8, pad1); #else _mm_storel_epi64((__m128i*)dst1, pad1); _mm_storel_epi64((__m128i*)dst2, pad2); _mm_storel_epi64((__m128i*)dst3, pad3); _mm_storel_epi64((__m128i*)dst4, pad4); _mm_storel_epi64((__m128i*)dst5, pad5); _mm_storel_epi64((__m128i*)dst6, pad6); _mm_storel_epi64((__m128i*)dst7, pad7); _mm_storel_epi64((__m128i*)dst8, pad8); #endif dst1 = dst8 + i_dst; dst2 = dst1 + i_dst; dst3 = dst2 + i_dst; dst4 = dst3 + i_dst; dst5 = dst4 + i_dst; dst6 = dst5 + i_dst; dst7 = dst6 + i_dst; dst8 = dst7 + i_dst; #if BUGFIX_PREDICTION_INTRA _mm_storel_epi64((__m128i*)dst1, pad1); _mm_storel_epi64((__m128i*)dst2, pad1); _mm_storel_epi64((__m128i*)dst3, pad1); _mm_storel_epi64((__m128i*)dst4, pad1); _mm_storel_epi64((__m128i*)dst5, pad1); _mm_storel_epi64((__m128i*)dst6, pad1); _mm_storel_epi64((__m128i*)dst7, pad1); _mm_storel_epi64((__m128i*)dst8, pad1); #else _mm_storel_epi64((__m128i*)dst1, pad1); _mm_storel_epi64((__m128i*)dst2, pad2); _mm_storel_epi64((__m128i*)dst3, pad3); _mm_storel_epi64((__m128i*)dst4, pad4); _mm_storel_epi64((__m128i*)dst5, pad5); 
_mm_storel_epi64((__m128i*)dst6, pad6); _mm_storel_epi64((__m128i*)dst7, pad7); _mm_storel_epi64((__m128i*)dst8, pad8); #endif } } else { pel_t *dst1 = dst; pel_t *dst2 = dst1 + i_dst; pel_t *dst3 = dst2 + i_dst; pel_t *dst4 = dst3 + i_dst; __m128i p00, p10, p20, p30; __m128i SS1 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i L1 = _mm_unpacklo_epi8(SS1, zero); SS1 = _mm_srli_si128(SS1, 1); __m128i L2 = _mm_unpacklo_epi8(SS1, zero); SS1 = _mm_srli_si128(SS1, 1); __m128i L3 = _mm_unpacklo_epi8(SS1, zero); SS1 = _mm_srli_si128(SS1, 1); __m128i L4 = _mm_unpacklo_epi8(SS1, zero); SS1 = _mm_srli_si128(SS1, 1); __m128i L5 = _mm_unpacklo_epi8(SS1, zero); SS1 = _mm_srli_si128(SS1, 1); __m128i L6 = _mm_unpacklo_epi8(SS1, zero); SS1 = _mm_srli_si128(SS1, 1); __m128i L7 = _mm_unpacklo_epi8(SS1, zero); SS1 = _mm_srli_si128(SS1, 1); __m128i L8 = _mm_unpacklo_epi8(SS1, zero); p00 = _mm_mullo_epi16(L1, coeff5); p10 = _mm_mullo_epi16(L2, coeff13); p20 = _mm_mullo_epi16(L3, coeff11); p30 = _mm_mullo_epi16(L4, coeff3); p00 = _mm_add_epi16(p00, coeff16); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, p00); *((int*)(dst1)) = _mm_cvtsi128_si32(p00); p10 = _mm_mullo_epi16(L3, coeff5); p20 = _mm_mullo_epi16(L4, coeff7); p30 = _mm_mullo_epi16(L5, coeff3); p00 = _mm_add_epi16(L2, coeff8); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 4); p00 = _mm_packus_epi16(p00, p00); *((int*)(dst2)) = _mm_cvtsi128_si32(p00); p00 = _mm_mullo_epi16(L4, coeff7); p10 = _mm_mullo_epi16(L5, coeff15); p20 = _mm_mullo_epi16(L6, coeff9); p30 = _mm_add_epi16(L7, coeff16); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, p00); *((int*)(dst3)) = _mm_cvtsi128_si32(p00); p00 = _mm_add_epi16(L5, L8); p10 = _mm_add_epi16(L6, L7); p10 = _mm_mullo_epi16(p10, coeff3); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_add_epi16(p00, p10); p00 = _mm_srli_epi16(p00, 3); p00 = _mm_packus_epi16(p00, p00); *((int*)(dst4)) = _mm_cvtsi128_si32(p00); #if !BUGFIX_PREDICTION_INTRA dst4[3] = dst4[2]; #endif if (bsy == 16) { pel_t *dst5 = dst4 + i_dst; pel_t *dst6 = dst5 + i_dst; pel_t *dst7 = dst6 + i_dst; pel_t *dst8 = dst7 + i_dst; src += 8; #if BUGFIX_PREDICTION_INTRA __m128i pad1 = _mm_set1_epi8(src[0]); *(int*)(dst5) = _mm_cvtsi128_si32(pad1); *(int*)(dst6) = _mm_cvtsi128_si32(pad1); *(int*)(dst7) = _mm_cvtsi128_si32(pad1); *(int*)(dst8) = _mm_cvtsi128_si32(pad1); #else __m128i pad1 = _mm_set1_epi8((pel_t)((5 * src[0] + 13 * src[1] + 11 * src[2] + 3 * src[3] + 16) >> 5)); __m128i pad2 = _mm_set1_epi8((pel_t)((src[0] + 5 * src[1] + 7 * src[2] + 3 * src[3] + 8) >> 4)); __m128i pad3 = _mm_set1_epi8((pel_t)((7 * src[0] + 15 * src[1] + 9 * src[2] + 1 * src[3] + 16) >> 5)); __m128i pad4 = _mm_set1_epi8(dst4[3]); __m128i pad5 = _mm_set1_epi8((pel_t)((src[0] + 9 * src[1] + 15 * src[2] + 7 * src[3] + 16) >> 5)); __m128i pad6 = _mm_set1_epi8((pel_t)((3 * src[0] + 7 * src[1] + 5 * src[2] + src[3] + 8) >> 4)); __m128i pad7 = _mm_set1_epi8((pel_t)((3 * src[0] + 11 * src[1] + 13 * src[2] + 5 * src[3] + 16) >> 5)); __m128i pad8 = _mm_set1_epi8((pel_t)((src[0] + 2 * src[1] + src[2] + 2) >> 2)); *(int*)(dst5) = _mm_cvtsi128_si32(pad5); *(int*)(dst6) = _mm_cvtsi128_si32(pad6); *(int*)(dst7) = _mm_cvtsi128_si32(pad7); *(int*)(dst8) = _mm_cvtsi128_si32(pad8); #endif dst5[0] = (pel_t)((src[-2] + 9 * 
src[-1] + 15 * src[0] + 7 * src[1] + 16) >> 5); dst5[1] = (pel_t)((src[-1] + 9 * src[0] + 15 * src[1] + 7 * src[2] + 16) >> 5); dst1 = dst8 + i_dst; dst2 = dst1 + i_dst; dst3 = dst2 + i_dst; dst4 = dst3 + i_dst; dst5 = dst4 + i_dst; dst6 = dst5 + i_dst; dst7 = dst6 + i_dst; dst8 = dst7 + i_dst; #if BUGFIX_PREDICTION_INTRA *(int*)(dst1) = _mm_cvtsi128_si32(pad1); *(int*)(dst2) = _mm_cvtsi128_si32(pad1); *(int*)(dst3) = _mm_cvtsi128_si32(pad1); *(int*)(dst4) = _mm_cvtsi128_si32(pad1); *(int*)(dst5) = _mm_cvtsi128_si32(pad1); *(int*)(dst6) = _mm_cvtsi128_si32(pad1); *(int*)(dst7) = _mm_cvtsi128_si32(pad1); *(int*)(dst8) = _mm_cvtsi128_si32(pad1); #else *(int*)(dst1) = _mm_cvtsi128_si32(pad1); *(int*)(dst2) = _mm_cvtsi128_si32(pad2); *(int*)(dst3) = _mm_cvtsi128_si32(pad3); *(int*)(dst4) = _mm_cvtsi128_si32(pad4); *(int*)(dst5) = _mm_cvtsi128_si32(pad5); *(int*)(dst6) = _mm_cvtsi128_si32(pad6); *(int*)(dst7) = _mm_cvtsi128_si32(pad7); *(int*)(dst8) = _mm_cvtsi128_si32(pad8); #endif } } } /* --------------------------------------------------------------------------- */ void intra_pred_ang_x_6_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { ALIGN16(pel_t first_line[64 + 64]); int line_size = bsx + bsy - 1; #if !BUGFIX_PREDICTION_INTRA int real_size = DAVS2_MIN(line_size, bsx * 2 - 1); #endif int i; __m128i zero = _mm_setzero_si128(); __m128i offset = _mm_set1_epi16(2); UNUSED_PARAMETER(dir_mode); src += 2; #if BUGFIX_PREDICTION_INTRA for (i = 0; i < line_size - 8; i += 16, src += 16) { #else for (i = 0; i < real_size - 8; i += 16, src += 16) { #endif __m128i S0 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i S1 = _mm_loadu_si128((__m128i*)(src)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i H0 = _mm_unpackhi_epi8(S0, zero); __m128i H1 = _mm_unpackhi_epi8(S1, zero); __m128i H2 = _mm_unpackhi_epi8(S2, zero); __m128i sum1 = _mm_add_epi16(L0, L1); __m128i sum2 = _mm_add_epi16(L1, L2); __m128i sum3 = _mm_add_epi16(H0, H1); __m128i sum4 = _mm_add_epi16(H1, H2); sum1 = _mm_add_epi16(sum1, sum2); sum3 = _mm_add_epi16(sum3, sum4); sum1 = _mm_add_epi16(sum1, offset); sum3 = _mm_add_epi16(sum3, offset); sum1 = _mm_srli_epi16(sum1, 2); sum3 = _mm_srli_epi16(sum3, 2); sum1 = _mm_packus_epi16(sum1, sum3); _mm_store_si128((__m128i*)&first_line[i], sum1); } #if BUGFIX_PREDICTION_INTRA if (i < line_size) { #else if (i < real_size) { #endif __m128i S0 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i S1 = _mm_loadu_si128((__m128i*)(src)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i sum1 = _mm_add_epi16(L0, L1); __m128i sum2 = _mm_add_epi16(L1, L2); sum1 = _mm_add_epi16(sum1, sum2); sum1 = _mm_add_epi16(sum1, offset); sum1 = _mm_srli_epi16(sum1, 2); sum1 = _mm_packus_epi16(sum1, sum1); _mm_storel_epi64((__m128i*)&first_line[i], sum1); } #if !BUGFIX_PREDICTION_INTRA // padding for (i = real_size; i < line_size; i += 16) { __m128i pad = _mm_set1_epi8(first_line[real_size - 1]); _mm_storeu_si128((__m128i*)&first_line[i], pad); } #endif if (bsx > 16 || bsx == 4) { for (i = 0; i < bsy; i++) { memcpy(dst, first_line + i, bsx * sizeof(pel_t)); dst += i_dst; } } else if (bsx == 16) { pel_t *dst1 = dst; pel_t *dst2; if (bsy == 4) { __m128i M = _mm_loadu_si128((__m128i*)&first_line[0]); _mm_storel_epi64((__m128i*)dst1, M); 
dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); dst1 = dst + 8; M = _mm_loadu_si128((__m128i*)&first_line[8]); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); } else { __m128i M = _mm_loadu_si128((__m128i*)&first_line[0]); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); dst2 = dst1 + i_dst; dst1 = dst + 8; M = _mm_loadu_si128((__m128i*)&first_line[8]); _mm_storel_epi64((__m128i*)dst1, M); _mm_storel_epi64((__m128i*)dst2, M); dst1 += i_dst; dst2 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); _mm_storel_epi64((__m128i*)dst2, M); dst1 += i_dst; dst2 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); _mm_storel_epi64((__m128i*)dst2, M); dst1 += i_dst; dst2 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); _mm_storel_epi64((__m128i*)dst2, M); dst1 += i_dst; dst2 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); _mm_storel_epi64((__m128i*)dst2, M); dst1 += i_dst; dst2 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); _mm_storel_epi64((__m128i*)dst2, M); dst1 += i_dst; dst2 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); _mm_storel_epi64((__m128i*)dst2, M); dst1 += i_dst; dst2 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); _mm_storel_epi64((__m128i*)dst2, M); dst1 += i_dst; M = _mm_loadu_si128((__m128i*)&first_line[16]); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); } } else { for (i = 0; i < bsy; i += 8) { __m128i M = _mm_loadu_si128((__m128i*)&first_line[i]); _mm_storel_epi64((__m128i*)dst, M); dst += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst, M); dst += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst, M); dst += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst, M); dst += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst, M); dst += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst, M); dst += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst, M); dst += i_dst; M = _mm_srli_si128(M, 
1); _mm_storel_epi64((__m128i*)dst, M); dst += i_dst; } } } /* --------------------------------------------------------------------------- */ void intra_pred_ang_x_7_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i, j; int iWidth2 = bsx << 1; __m128i zero = _mm_setzero_si128(); __m128i S0, S1, S2, S3; __m128i t0, t1, t2, t3; __m128i off = _mm_set1_epi16(64); __m128i c0; UNUSED_PARAMETER(dir_mode); if (bsx >= bsy) { if (bsx & 0x07) { __m128i D0; int i_dst2 = i_dst << 1; for (j = 0; j < bsy; j += 2) { int idx = tab_idx_mode_7[j]; c0 = _mm_load_si128((__m128i*)tab_coeff_mode_7[j]); S0 = _mm_loadl_epi64((__m128i*)(src + idx)); S1 = _mm_srli_si128(S0, 1); S2 = _mm_srli_si128(S0, 2); S3 = _mm_srli_si128(S0, 3); t0 = _mm_unpacklo_epi8(S0, S1); t1 = _mm_unpacklo_epi8(S2, S3); t2 = _mm_unpacklo_epi16(t0, t1); t0 = _mm_maddubs_epi16(t2, c0); idx = tab_idx_mode_7[j + 1]; c0 = _mm_load_si128((__m128i*)tab_coeff_mode_7[j + 1]); S0 = _mm_loadl_epi64((__m128i*)(src + idx)); S1 = _mm_srli_si128(S0, 1); S2 = _mm_srli_si128(S0, 2); S3 = _mm_srli_si128(S0, 3); t1 = _mm_unpacklo_epi8(S0, S1); t2 = _mm_unpacklo_epi8(S2, S3); t1 = _mm_unpacklo_epi16(t1, t2); t1 = _mm_maddubs_epi16(t1, c0); D0 = _mm_hadds_epi16(t0, t1); D0 = _mm_add_epi16(D0, off); D0 = _mm_srli_epi16(D0, 7); D0 = _mm_packus_epi16(D0, zero); ((uint32_t*)(dst))[0] = _mm_cvtsi128_si32(D0); D0= _mm_srli_si128(D0, 4); ((uint32_t*)(dst + i_dst))[0] = _mm_cvtsi128_si32(D0); //dst[i] = (pel_t)((src[idx] * c1 + src[idx + 1] * c2 + src[idx + 2] * c3 + src[idx + 3] * c4 + 64) >> 7); dst += i_dst2; } } else if (bsx & 0x0f) { __m128i D0; for (j = 0; j < bsy; j++) { int idx = tab_idx_mode_7[j]; c0 = _mm_load_si128((__m128i*)tab_coeff_mode_7[j]); S0 = _mm_loadu_si128((__m128i*)(src + idx)); S1 = _mm_srli_si128(S0, 1); S2 = _mm_srli_si128(S0, 2); S3 = _mm_srli_si128(S0, 3); t0 = _mm_unpacklo_epi8(S0, S1); t1 = _mm_unpacklo_epi8(S2, S3); t2 = _mm_unpacklo_epi16(t0, t1); t3 = _mm_unpackhi_epi16(t0, t1); t0 = _mm_maddubs_epi16(t2, c0); t1 = _mm_maddubs_epi16(t3, c0); D0 = _mm_hadds_epi16(t0, t1); D0 = _mm_add_epi16(D0, off); D0 = _mm_srli_epi16(D0, 7); D0 = _mm_packus_epi16(D0, _mm_setzero_si128()); _mm_storel_epi64((__m128i*)(dst), D0); //dst[i] = (pel_t)((src[idx] * c1 + src[idx + 1] * c2 + src[idx + 2] * c3 + src[idx + 3] * c4 + 64) >> 7); dst += i_dst; } } else { for (j = 0; j < bsy; j++) { __m128i D0, D1; int idx = tab_idx_mode_7[j]; c0 = _mm_load_si128((__m128i*)tab_coeff_mode_7[j]); for (i = 0; i < bsx; i += 16, idx += 16) { S0 = _mm_loadu_si128((__m128i*)(src + idx)); S1 = _mm_loadu_si128((__m128i*)(src + idx + 1)); S2 = _mm_loadu_si128((__m128i*)(src + idx + 2)); S3 = _mm_loadu_si128((__m128i*)(src + idx + 3)); t0 = _mm_unpacklo_epi8(S0, S1); t1 = _mm_unpacklo_epi8(S2, S3); t2 = _mm_unpacklo_epi16(t0, t1); t3 = _mm_unpackhi_epi16(t0, t1); t0 = _mm_maddubs_epi16(t2, c0); t1 = _mm_maddubs_epi16(t3, c0); D0 = _mm_hadds_epi16(t0, t1); D0 = _mm_add_epi16(D0, off); D0 = _mm_srli_epi16(D0, 7); t0 = _mm_unpackhi_epi8(S0, S1); t1 = _mm_unpackhi_epi8(S2, S3); t2 = _mm_unpacklo_epi16(t0, t1); t3 = _mm_unpackhi_epi16(t0, t1); t0 = _mm_maddubs_epi16(t2, c0); t1 = _mm_maddubs_epi16(t3, c0); D1 = _mm_hadds_epi16(t0, t1); D1 = _mm_add_epi16(D1, off); D1 = _mm_srli_epi16(D1, 7); D0 = _mm_packus_epi16(D0, D1); _mm_storeu_si128((__m128i*)(dst + i), D0); //dst[i] = (pel_t)((src[idx] * c1 + src[idx + 1] * c2 + src[idx + 2] * c3 + src[idx + 3] * c4 + 64) >> 7); } dst += i_dst; } } } else { if (bsx & 0x07) { for (j = 0; j < bsy; j++) { int 
real_width; int idx = tab_idx_mode_7[j]; real_width = DAVS2_MIN(bsx, iWidth2 - idx + 1); if (real_width <= 0) { pel_t val = (pel_t)((src[iWidth2] * tab_coeff_mode_7[j][0] + src[iWidth2 + 1] * tab_coeff_mode_7[j][1] + src[iWidth2 + 2] * tab_coeff_mode_7[j][2] + src[iWidth2 + 3] * tab_coeff_mode_7[j][3] + 64) >> 7); __m128i D0 = _mm_set1_epi8((char)val); _mm_storel_epi64((__m128i*)(dst), D0); dst += i_dst; j++; for (; j < bsy; j++) { val = (pel_t)((src[iWidth2] * tab_coeff_mode_7[j][0] + src[iWidth2 + 1] * tab_coeff_mode_7[j][1] + src[iWidth2 + 2] * tab_coeff_mode_7[j][2] + src[iWidth2 + 3] * tab_coeff_mode_7[j][3] + 64) >> 7); D0 = _mm_set1_epi8((char)val); _mm_storel_epi64((__m128i*)(dst), D0); dst += i_dst; } break; } else { __m128i D0; c0 = _mm_load_si128((__m128i*)tab_coeff_mode_7[j]); S0 = _mm_loadl_epi64((__m128i*)(src + idx)); S1 = _mm_srli_si128(S0, 1); S2 = _mm_srli_si128(S0, 2); S3 = _mm_srli_si128(S0, 3); t0 = _mm_unpacklo_epi8(S0, S1); t1 = _mm_unpacklo_epi8(S2, S3); t2 = _mm_unpacklo_epi16(t0, t1); t0 = _mm_maddubs_epi16(t2, c0); D0 = _mm_hadds_epi16(t0, zero); D0 = _mm_add_epi16(D0, off); D0 = _mm_srli_epi16(D0, 7); D0 = _mm_packus_epi16(D0, zero); _mm_storel_epi64((__m128i*)(dst), D0); if (real_width < bsx) { D0 = _mm_set1_epi8((char)dst[real_width - 1]); _mm_storel_epi64((__m128i*)(dst + real_width), D0); } } dst += i_dst; } } else if (bsx & 0x0f) { for (j = 0; j < bsy; j++) { int real_width; int idx = tab_idx_mode_7[j]; real_width = DAVS2_MIN(bsx, iWidth2 - idx + 1); if (real_width <= 0) { pel_t val = (pel_t)((src[iWidth2] * tab_coeff_mode_7[j][0] + src[iWidth2 + 1] * tab_coeff_mode_7[j][1] + src[iWidth2 + 2] * tab_coeff_mode_7[j][2] + src[iWidth2 + 3] * tab_coeff_mode_7[j][3] + 64) >> 7); __m128i D0 = _mm_set1_epi8((char)val); _mm_storel_epi64((__m128i*)(dst), D0); dst += i_dst; j++; for (; j < bsy; j++) { val = (pel_t)((src[iWidth2] * tab_coeff_mode_7[j][0] + src[iWidth2 + 1] * tab_coeff_mode_7[j][1] + src[iWidth2 + 2] * tab_coeff_mode_7[j][2] + src[iWidth2 + 3] * tab_coeff_mode_7[j][3] + 64) >> 7); D0 = _mm_set1_epi8((char)val); _mm_storel_epi64((__m128i*)(dst), D0); dst += i_dst; } break; } else { __m128i D0; c0 = _mm_load_si128((__m128i*)tab_coeff_mode_7[j]); S0 = _mm_loadu_si128((__m128i*)(src + idx)); S1 = _mm_srli_si128(S0, 1); S2 = _mm_srli_si128(S0, 2); S3 = _mm_srli_si128(S0, 3); t0 = _mm_unpacklo_epi8(S0, S1); t1 = _mm_unpacklo_epi8(S2, S3); t2 = _mm_unpacklo_epi16(t0, t1); t3 = _mm_unpackhi_epi16(t0, t1); t0 = _mm_maddubs_epi16(t2, c0); t1 = _mm_maddubs_epi16(t3, c0); D0 = _mm_hadds_epi16(t0, t1); D0 = _mm_add_epi16(D0, off); D0 = _mm_srli_epi16(D0, 7); D0 = _mm_packus_epi16(D0, zero); _mm_storel_epi64((__m128i*)(dst), D0); //dst[i] = (pel_t)((src[idx] * c1 + src[idx + 1] * c2 + src[idx + 2] * c3 + src[idx + 3] * c4 + 64) >> 7); if (real_width < bsx) { D0 = _mm_set1_epi8((char)dst[real_width - 1]); _mm_storel_epi64((__m128i*)(dst + real_width), D0); } } dst += i_dst; } } else { for (j = 0; j < bsy; j++) { int real_width; int idx = tab_idx_mode_7[j]; real_width = DAVS2_MIN(bsx, iWidth2 - idx + 1); if (real_width <= 0) { pel_t val = (pel_t)((src[iWidth2] * tab_coeff_mode_7[j][0] + src[iWidth2 + 1] * tab_coeff_mode_7[j][1] + src[iWidth2 + 2] * tab_coeff_mode_7[j][2] + src[iWidth2 + 3] * tab_coeff_mode_7[j][3] + 64) >> 7); __m128i D0 = _mm_set1_epi8((char)val); for (i = 0; i < bsx; i += 16) { _mm_storeu_si128((__m128i*)(dst + i), D0); } dst += i_dst; j++; for (; j < bsy; j++) { val = (pel_t)((src[iWidth2] * tab_coeff_mode_7[j][0] + src[iWidth2 + 1] * 
tab_coeff_mode_7[j][1] + src[iWidth2 + 2] * tab_coeff_mode_7[j][2] + src[iWidth2 + 3] * tab_coeff_mode_7[j][3] + 64) >> 7); D0 = _mm_set1_epi8((char)val); for (i = 0; i < bsx; i += 16) { _mm_storeu_si128((__m128i*)(dst + i), D0); } dst += i_dst; } break; } else { __m128i D0, D1; c0 = _mm_load_si128((__m128i*)tab_coeff_mode_7[j]); for (i = 0; i < real_width; i += 16, idx += 16) { S0 = _mm_loadu_si128((__m128i*)(src + idx)); S1 = _mm_loadu_si128((__m128i*)(src + idx + 1)); S2 = _mm_loadu_si128((__m128i*)(src + idx + 2)); S3 = _mm_loadu_si128((__m128i*)(src + idx + 3)); t0 = _mm_unpacklo_epi8(S0, S1); t1 = _mm_unpacklo_epi8(S2, S3); t2 = _mm_unpacklo_epi16(t0, t1); t3 = _mm_unpackhi_epi16(t0, t1); t0 = _mm_maddubs_epi16(t2, c0); t1 = _mm_maddubs_epi16(t3, c0); D0 = _mm_hadds_epi16(t0, t1); D0 = _mm_add_epi16(D0, off); D0 = _mm_srli_epi16(D0, 7); t0 = _mm_unpackhi_epi8(S0, S1); t1 = _mm_unpackhi_epi8(S2, S3); t2 = _mm_unpacklo_epi16(t0, t1); t3 = _mm_unpackhi_epi16(t0, t1); t0 = _mm_maddubs_epi16(t2, c0); t1 = _mm_maddubs_epi16(t3, c0); D1 = _mm_hadds_epi16(t0, t1); D1 = _mm_add_epi16(D1, off); D1 = _mm_srli_epi16(D1, 7); D0 = _mm_packus_epi16(D0, D1); _mm_store_si128((__m128i*)(dst + i), D0); //dst[i] = (pel_t)((src[idx] * c1 + src[idx + 1] * c2 + src[idx + 2] * c3 + src[idx + 3] * c4 + 64) >> 7); } if (real_width < bsx) { D0 = _mm_set1_epi8((char)dst[real_width - 1]); for (i = real_width; i < bsx; i += 16) { _mm_storeu_si128((__m128i*)(dst + i), D0); //dst[i] = dst[real_width - 1]; } } } dst += i_dst; } } } } /* --------------------------------------------------------------------------- */ void intra_pred_ang_x_8_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { ALIGN16(pel_t first_line[2 * (64 + 48)]); int line_size = bsx + (bsy >> 1) - 1; #if !BUGFIX_PREDICTION_INTRA int real_size = DAVS2_MIN(line_size, (bsx << 1)); #endif int i; #if !BUGFIX_PREDICTION_INTRA __m128i pad1, pad2; #endif int aligned_line_size = ((line_size + 31) >> 4) << 4; pel_t *pfirst[2]; __m128i zero = _mm_setzero_si128(); __m128i coeff = _mm_set1_epi16(3); __m128i offset1 = _mm_set1_epi16(4); __m128i offset2 = _mm_set1_epi16(2); int i_dst2 = i_dst * 2; UNUSED_PARAMETER(dir_mode); pfirst[0] = first_line; pfirst[1] = first_line + aligned_line_size; #if BUGFIX_PREDICTION_INTRA for (i = 0; i < line_size - 8; i += 16, src += 16) { #else for (i = 0; i < real_size - 8; i += 16, src += 16) { #endif __m128i p01, p02, p11, p12; __m128i S0 = _mm_loadu_si128((__m128i*)(src)); __m128i S3 = _mm_loadu_si128((__m128i*)(src + 3)); __m128i S1 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src + 2)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i L3 = _mm_unpacklo_epi8(S3, zero); __m128i H0 = _mm_unpackhi_epi8(S0, zero); __m128i H1 = _mm_unpackhi_epi8(S1, zero); __m128i H2 = _mm_unpackhi_epi8(S2, zero); __m128i H3 = _mm_unpackhi_epi8(S3, zero); p01 = _mm_add_epi16(L1, L2); p01 = _mm_mullo_epi16(p01, coeff); p02 = _mm_add_epi16(L0, L3); p02 = _mm_add_epi16(p02, offset1); p01 = _mm_add_epi16(p01, p02); p01 = _mm_srli_epi16(p01, 3); p11 = _mm_add_epi16(H1, H2); p11 = _mm_mullo_epi16(p11, coeff); p12 = _mm_add_epi16(H0, H3); p12 = _mm_add_epi16(p12, offset1); p11 = _mm_add_epi16(p11, p12); p11 = _mm_srli_epi16(p11, 3); p01 = _mm_packus_epi16(p01, p11); _mm_store_si128((__m128i*)&pfirst[0][i], p01); p01 = _mm_add_epi16(L1, L2); p02 = _mm_add_epi16(L2, L3); p11 = _mm_add_epi16(H1, H2); p12 = 
_mm_add_epi16(H2, H3); p01 = _mm_add_epi16(p01, p02); p11 = _mm_add_epi16(p11, p12); p01 = _mm_add_epi16(p01, offset2); p11 = _mm_add_epi16(p11, offset2); p01 = _mm_srli_epi16(p01, 2); p11 = _mm_srli_epi16(p11, 2); p01 = _mm_packus_epi16(p01, p11); _mm_store_si128((__m128i*)&pfirst[1][i], p01); } #if BUGFIX_PREDICTION_INTRA if (i < line_size) { #else if (i < real_size) { #endif __m128i p01, p02; __m128i S0 = _mm_loadu_si128((__m128i*)(src)); __m128i S3 = _mm_loadu_si128((__m128i*)(src + 3)); __m128i S1 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src + 2)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i L3 = _mm_unpacklo_epi8(S3, zero); p01 = _mm_add_epi16(L1, L2); p01 = _mm_mullo_epi16(p01, coeff); p02 = _mm_add_epi16(L0, L3); p02 = _mm_add_epi16(p02, offset1); p01 = _mm_add_epi16(p01, p02); p01 = _mm_srli_epi16(p01, 3); p01 = _mm_packus_epi16(p01, p01); _mm_storel_epi64((__m128i*)&pfirst[0][i], p01); p01 = _mm_add_epi16(L1, L2); p02 = _mm_add_epi16(L2, L3); p01 = _mm_add_epi16(p01, p02); p01 = _mm_add_epi16(p01, offset2); p01 = _mm_srli_epi16(p01, 2); p01 = _mm_packus_epi16(p01, p01); _mm_storel_epi64((__m128i*)&pfirst[1][i], p01); } #if !BUGFIX_PREDICTION_INTRA // padding if (real_size < line_size) { pfirst[1][real_size - 1] = pfirst[1][real_size - 2]; pad1 = _mm_set1_epi8(pfirst[0][real_size - 1]); pad2 = _mm_set1_epi8(pfirst[1][real_size - 1]); for (i = real_size; i < line_size; i += 16) { _mm_storeu_si128((__m128i*)&pfirst[0][i], pad1); _mm_storeu_si128((__m128i*)&pfirst[1][i], pad2); } } #endif bsy >>= 1; if (bsx != 8) { for (i = 0; i < bsy; i++) { memcpy(dst, pfirst[0] + i, bsx * sizeof(pel_t)); memcpy(dst + i_dst, pfirst[1] + i, bsx * sizeof(pel_t)); dst += i_dst2; } } else if (bsy == 4) { __m128i M1 = _mm_loadu_si128((__m128i*)&pfirst[0][0]); __m128i M2 = _mm_loadu_si128((__m128i*)&pfirst[1][0]); _mm_storel_epi64((__m128i*)dst, M1); _mm_storel_epi64((__m128i*)(dst + i_dst), M2); dst += i_dst2; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); _mm_storel_epi64((__m128i*)dst, M1); _mm_storel_epi64((__m128i*)(dst + i_dst), M2); dst += i_dst2; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); _mm_storel_epi64((__m128i*)dst, M1); _mm_storel_epi64((__m128i*)(dst + i_dst), M2); dst += i_dst2; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); _mm_storel_epi64((__m128i*)dst, M1); _mm_storel_epi64((__m128i*)(dst + i_dst), M2); } else { for (i = 0; i < 16; i = i + 8) { __m128i M1 = _mm_loadu_si128((__m128i*)&pfirst[0][i]); __m128i M2 = _mm_loadu_si128((__m128i*)&pfirst[1][i]); _mm_storel_epi64((__m128i*)dst, M1); _mm_storel_epi64((__m128i*)(dst + i_dst), M2); dst += i_dst2; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); _mm_storel_epi64((__m128i*)dst, M1); _mm_storel_epi64((__m128i*)(dst + i_dst), M2); dst += i_dst2; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); _mm_storel_epi64((__m128i*)dst, M1); _mm_storel_epi64((__m128i*)(dst + i_dst), M2); dst += i_dst2; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); _mm_storel_epi64((__m128i*)dst, M1); _mm_storel_epi64((__m128i*)(dst + i_dst), M2); dst += i_dst2; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); _mm_storel_epi64((__m128i*)dst, M1); _mm_storel_epi64((__m128i*)(dst + i_dst), M2); dst += i_dst2; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); _mm_storel_epi64((__m128i*)dst, M1); _mm_storel_epi64((__m128i*)(dst + i_dst), M2); dst += i_dst2; M1 = 
_mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); _mm_storel_epi64((__m128i*)dst, M1); _mm_storel_epi64((__m128i*)(dst + i_dst), M2); dst += i_dst2; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); _mm_storel_epi64((__m128i*)dst, M1); _mm_storel_epi64((__m128i*)(dst + i_dst), M2); dst += i_dst2; } } } /* --------------------------------------------------------------------------- */ void intra_pred_ang_x_9_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i, j; int iWidth2 = bsx << 1; __m128i zero = _mm_setzero_si128(); __m128i S0, S1, S2, S3; __m128i t0, t1, t2, t3; __m128i off = _mm_set1_epi16(64); __m128i c0; UNUSED_PARAMETER(dir_mode); if (bsx >= bsy) { if (bsx & 0x07) { __m128i D0; int i_dst2 = i_dst << 1; for (j = 0; j < bsy; j += 2) { int idx = tab_idx_mode_9[j]; c0 = _mm_load_si128((__m128i*)tab_coeff_mode_9[j]); S0 = _mm_loadl_epi64((__m128i*)(src + idx)); S1 = _mm_srli_si128(S0, 1); S2 = _mm_srli_si128(S0, 2); S3 = _mm_srli_si128(S0, 3); t0 = _mm_unpacklo_epi8(S0, S1); t1 = _mm_unpacklo_epi8(S2, S3); t2 = _mm_unpacklo_epi16(t0, t1); t0 = _mm_maddubs_epi16(t2, c0); idx = tab_idx_mode_9[j + 1]; c0 = _mm_load_si128((__m128i*)tab_coeff_mode_9[j + 1]); S0 = _mm_loadl_epi64((__m128i*)(src + idx)); S1 = _mm_srli_si128(S0, 1); S2 = _mm_srli_si128(S0, 2); S3 = _mm_srli_si128(S0, 3); t1 = _mm_unpacklo_epi8(S0, S1); t2 = _mm_unpacklo_epi8(S2, S3); t1 = _mm_unpacklo_epi16(t1, t2); t1 = _mm_maddubs_epi16(t1, c0); D0 = _mm_hadds_epi16(t0, t1); D0 = _mm_add_epi16(D0, off); D0 = _mm_srli_epi16(D0, 7); D0 = _mm_packus_epi16(D0, zero); ((uint32_t*)(dst))[0] = _mm_cvtsi128_si32(D0); D0 = _mm_srli_si128(D0, 4); ((uint32_t*)(dst + i_dst))[0] = _mm_cvtsi128_si32(D0); //_mm_maskmoveu_si128(D0, mask, (char*)(dst + i_dst)); //dst[i] = (pel_t)((src[idx] * c1 + src[idx + 1] * c2 + src[idx + 2] * c3 + src[idx + 3] * c4 + 64) >> 7); dst += i_dst2; } } else if (bsx & 0x0f) { __m128i D0; for (j = 0; j < bsy; j++) { int idx = tab_idx_mode_9[j]; c0 = _mm_load_si128((__m128i*)tab_coeff_mode_9[j]); S0 = _mm_loadu_si128((__m128i*)(src + idx)); S1 = _mm_srli_si128(S0, 1); S2 = _mm_srli_si128(S0, 2); S3 = _mm_srli_si128(S0, 3); t0 = _mm_unpacklo_epi8(S0, S1); t1 = _mm_unpacklo_epi8(S2, S3); t2 = _mm_unpacklo_epi16(t0, t1); t3 = _mm_unpackhi_epi16(t0, t1); t0 = _mm_maddubs_epi16(t2, c0); t1 = _mm_maddubs_epi16(t3, c0); D0 = _mm_hadds_epi16(t0, t1); D0 = _mm_add_epi16(D0, off); D0 = _mm_srli_epi16(D0, 7); D0 = _mm_packus_epi16(D0, _mm_setzero_si128()); _mm_storel_epi64((__m128i*)(dst), D0); //dst[i] = (pel_t)((src[idx] * c1 + src[idx + 1] * c2 + src[idx + 2] * c3 + src[idx + 3] * c4 + 64) >> 7); dst += i_dst; } } else { for (j = 0; j < bsy; j++) { __m128i D0, D1; int idx = tab_idx_mode_9[j]; c0 = _mm_load_si128((__m128i*)tab_coeff_mode_9[j]); for (i = 0; i < bsx; i += 16, idx += 16) { S0 = _mm_loadu_si128((__m128i*)(src + idx)); S1 = _mm_loadu_si128((__m128i*)(src + idx + 1)); S2 = _mm_loadu_si128((__m128i*)(src + idx + 2)); S3 = _mm_loadu_si128((__m128i*)(src + idx + 3)); t0 = _mm_unpacklo_epi8(S0, S1); t1 = _mm_unpacklo_epi8(S2, S3); t2 = _mm_unpacklo_epi16(t0, t1); t3 = _mm_unpackhi_epi16(t0, t1); t0 = _mm_maddubs_epi16(t2, c0); t1 = _mm_maddubs_epi16(t3, c0); D0 = _mm_hadds_epi16(t0, t1); D0 = _mm_add_epi16(D0, off); D0 = _mm_srli_epi16(D0, 7); t0 = _mm_unpackhi_epi8(S0, S1); t1 = _mm_unpackhi_epi8(S2, S3); t2 = _mm_unpacklo_epi16(t0, t1); t3 = _mm_unpackhi_epi16(t0, t1); t0 = _mm_maddubs_epi16(t2, c0); t1 = _mm_maddubs_epi16(t3, c0); D1 = _mm_hadds_epi16(t0, t1); D1 = 
_mm_add_epi16(D1, off); D1 = _mm_srli_epi16(D1, 7); D0 = _mm_packus_epi16(D0, D1); _mm_storeu_si128((__m128i*)(dst + i), D0); //dst[i] = (pel_t)((src[idx] * c1 + src[idx + 1] * c2 + src[idx + 2] * c3 + src[idx + 3] * c4 + 64) >> 7); } dst += i_dst; } } } else { if (bsx & 0x07) { for (j = 0; j < bsy; j++) { int real_width; int idx = tab_idx_mode_9[j]; real_width = DAVS2_MIN(bsx, iWidth2 - idx + 1); if (real_width <= 0) { pel_t val = (pel_t)((src[iWidth2] * tab_coeff_mode_9[j][0] + src[iWidth2 + 1] * tab_coeff_mode_9[j][1] + src[iWidth2 + 2] * tab_coeff_mode_9[j][2] + src[iWidth2 + 3] * tab_coeff_mode_9[j][3] + 64) >> 7); __m128i D0 = _mm_set1_epi8((char)val); _mm_storel_epi64((__m128i*)(dst), D0); dst += i_dst; j++; for (; j < bsy; j++) { val = (pel_t)((src[iWidth2] * tab_coeff_mode_9[j][0] + src[iWidth2 + 1] * tab_coeff_mode_9[j][1] + src[iWidth2 + 2] * tab_coeff_mode_9[j][2] + src[iWidth2 + 3] * tab_coeff_mode_9[j][3] + 64) >> 7); D0 = _mm_set1_epi8((char)val); _mm_storel_epi64((__m128i*)(dst), D0); dst += i_dst; } break; } else { __m128i D0; c0 = _mm_load_si128((__m128i*)tab_coeff_mode_9[j]); S0 = _mm_loadl_epi64((__m128i*)(src + idx)); S1 = _mm_srli_si128(S0, 1); S2 = _mm_srli_si128(S0, 2); S3 = _mm_srli_si128(S0, 3); t0 = _mm_unpacklo_epi8(S0, S1); t1 = _mm_unpacklo_epi8(S2, S3); t2 = _mm_unpacklo_epi16(t0, t1); t0 = _mm_maddubs_epi16(t2, c0); D0 = _mm_hadds_epi16(t0, zero); D0 = _mm_add_epi16(D0, off); D0 = _mm_srli_epi16(D0, 7); D0 = _mm_packus_epi16(D0, zero); _mm_storel_epi64((__m128i*)(dst), D0); if (real_width < bsx) { D0 = _mm_set1_epi8((char)dst[real_width - 1]); _mm_storel_epi64((__m128i*)(dst + real_width), D0); } } dst += i_dst; } } else if (bsx & 0x0f) { for (j = 0; j < bsy; j++) { int real_width; int idx = tab_idx_mode_9[j]; real_width = DAVS2_MIN(bsx, iWidth2 - idx + 1); if (real_width <= 0) { pel_t val = (pel_t)((src[iWidth2] * tab_coeff_mode_9[j][0] + src[iWidth2 + 1] * tab_coeff_mode_9[j][1] + src[iWidth2 + 2] * tab_coeff_mode_9[j][2] + src[iWidth2 + 3] * tab_coeff_mode_9[j][3] + 64) >> 7); __m128i D0 = _mm_set1_epi8((char)val); _mm_storel_epi64((__m128i*)(dst), D0); dst += i_dst; j++; for (; j < bsy; j++) { val = (pel_t)((src[iWidth2] * tab_coeff_mode_9[j][0] + src[iWidth2 + 1] * tab_coeff_mode_9[j][1] + src[iWidth2 + 2] * tab_coeff_mode_9[j][2] + src[iWidth2 + 3] * tab_coeff_mode_9[j][3] + 64) >> 7); D0 = _mm_set1_epi8((char)val); _mm_storel_epi64((__m128i*)(dst), D0); dst += i_dst; } break; } else { __m128i D0; c0 = _mm_load_si128((__m128i*)tab_coeff_mode_9[j]); S0 = _mm_loadu_si128((__m128i*)(src + idx)); S1 = _mm_srli_si128(S0, 1); S2 = _mm_srli_si128(S0, 2); S3 = _mm_srli_si128(S0, 3); t0 = _mm_unpacklo_epi8(S0, S1); t1 = _mm_unpacklo_epi8(S2, S3); t2 = _mm_unpacklo_epi16(t0, t1); t3 = _mm_unpackhi_epi16(t0, t1); t0 = _mm_maddubs_epi16(t2, c0); t1 = _mm_maddubs_epi16(t3, c0); D0 = _mm_hadds_epi16(t0, t1); D0 = _mm_add_epi16(D0, off); D0 = _mm_srli_epi16(D0, 7); D0 = _mm_packus_epi16(D0, zero); _mm_storel_epi64((__m128i*)(dst), D0); //dst[i] = (pel_t)((src[idx] * c1 + src[idx + 1] * c2 + src[idx + 2] * c3 + src[idx + 3] * c4 + 64) >> 7); if (real_width < bsx) { D0 = _mm_set1_epi8((char)dst[real_width - 1]); _mm_storel_epi64((__m128i*)(dst + real_width), D0); } } dst += i_dst; } } else { for (j = 0; j < bsy; j++) { int real_width; int idx = tab_idx_mode_9[j]; real_width = DAVS2_MIN(bsx, iWidth2 - idx + 1); if (real_width <= 0) { pel_t val = (pel_t)((src[iWidth2] * tab_coeff_mode_9[j][0] + src[iWidth2 + 1] * tab_coeff_mode_9[j][1] + src[iWidth2 + 2] * 
tab_coeff_mode_9[j][2] + src[iWidth2 + 3] * tab_coeff_mode_9[j][3] + 64) >> 7); __m128i D0 = _mm_set1_epi8((char)val); for (i = 0; i < bsx; i += 16) { _mm_storeu_si128((__m128i*)(dst + i), D0); } dst += i_dst; j++; for (; j < bsy; j++) { val = (pel_t)((src[iWidth2] * tab_coeff_mode_9[j][0] + src[iWidth2 + 1] * tab_coeff_mode_9[j][1] + src[iWidth2 + 2] * tab_coeff_mode_9[j][2] + src[iWidth2 + 3] * tab_coeff_mode_9[j][3] + 64) >> 7); D0 = _mm_set1_epi8((char)val); for (i = 0; i < bsx; i += 16) { _mm_storeu_si128((__m128i*)(dst + i), D0); } dst += i_dst; } break; } else { __m128i D0, D1; c0 = _mm_load_si128((__m128i*)tab_coeff_mode_9[j]); for (i = 0; i < real_width; i += 16, idx += 16) { S0 = _mm_loadu_si128((__m128i*)(src + idx)); S1 = _mm_loadu_si128((__m128i*)(src + idx + 1)); S2 = _mm_loadu_si128((__m128i*)(src + idx + 2)); S3 = _mm_loadu_si128((__m128i*)(src + idx + 3)); t0 = _mm_unpacklo_epi8(S0, S1); t1 = _mm_unpacklo_epi8(S2, S3); t2 = _mm_unpacklo_epi16(t0, t1); t3 = _mm_unpackhi_epi16(t0, t1); t0 = _mm_maddubs_epi16(t2, c0); t1 = _mm_maddubs_epi16(t3, c0); D0 = _mm_hadds_epi16(t0, t1); D0 = _mm_add_epi16(D0, off); D0 = _mm_srli_epi16(D0, 7); t0 = _mm_unpackhi_epi8(S0, S1); t1 = _mm_unpackhi_epi8(S2, S3); t2 = _mm_unpacklo_epi16(t0, t1); t3 = _mm_unpackhi_epi16(t0, t1); t0 = _mm_maddubs_epi16(t2, c0); t1 = _mm_maddubs_epi16(t3, c0); D1 = _mm_hadds_epi16(t0, t1); D1 = _mm_add_epi16(D1, off); D1 = _mm_srli_epi16(D1, 7); D0 = _mm_packus_epi16(D0, D1); _mm_store_si128((__m128i*)(dst + i), D0); //dst[i] = (pel_t)((src[idx] * c1 + src[idx + 1] * c2 + src[idx + 2] * c3 + src[idx + 3] * c4 + 64) >> 7); } if (real_width < bsx) { D0 = _mm_set1_epi8((char)dst[real_width - 1]); for (i = real_width; i < bsx; i += 16) { _mm_storeu_si128((__m128i*)(dst + i), D0); //dst[i] = dst[real_width - 1]; } } } dst += i_dst; } } } } /* --------------------------------------------------------------------------- */ void intra_pred_ang_x_10_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i; pel_t *dst1 = dst; pel_t *dst2 = dst1 + i_dst; pel_t *dst3 = dst2 + i_dst; pel_t *dst4 = dst3 + i_dst; __m128i zero = _mm_setzero_si128(); __m128i coeff2 = _mm_set1_epi16(2); __m128i coeff3 = _mm_set1_epi16(3); __m128i coeff4 = _mm_set1_epi16(4); __m128i coeff5 = _mm_set1_epi16(5); __m128i coeff7 = _mm_set1_epi16(7); __m128i coeff8 = _mm_set1_epi16(8); UNUSED_PARAMETER(dir_mode); if (bsy != 4) { ALIGN16(pel_t first_line[4 * (64 + 32)]); int line_size = bsx + bsy / 4 - 1; int aligned_line_size = ((line_size + 31) >> 4) << 4; pel_t *pfirst[4]; pfirst[0] = first_line; pfirst[1] = first_line + aligned_line_size; pfirst[2] = first_line + aligned_line_size * 2; pfirst[3] = first_line + aligned_line_size * 3; for (i = 0; i < line_size - 8; i += 16, src += 16) { __m128i p00, p10, p20, p30; __m128i p01, p11, p21, p31; __m128i S0 = _mm_loadu_si128((__m128i*)(src)); __m128i S3 = _mm_loadu_si128((__m128i*)(src + 3)); __m128i S1 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src + 2)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i L3 = _mm_unpacklo_epi8(S3, zero); __m128i H0 = _mm_unpackhi_epi8(S0, zero); __m128i H1 = _mm_unpackhi_epi8(S1, zero); __m128i H2 = _mm_unpackhi_epi8(S2, zero); __m128i H3 = _mm_unpackhi_epi8(S3, zero); p00 = _mm_mullo_epi16(L0, coeff3); p10 = _mm_mullo_epi16(L1, coeff7); p20 = _mm_mullo_epi16(L2, coeff5); p30 = _mm_add_epi16(L3, coeff8); p00 = _mm_add_epi16(p00, 
p30); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_srli_epi16(p00, 4); p01 = _mm_mullo_epi16(H0, coeff3); p11 = _mm_mullo_epi16(H1, coeff7); p21 = _mm_mullo_epi16(H2, coeff5); p31 = _mm_add_epi16(H3, coeff8); p01 = _mm_add_epi16(p01, p31); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_srli_epi16(p01, 4); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)&pfirst[0][i], p00); p00 = _mm_add_epi16(L1, L2); p00 = _mm_mullo_epi16(p00, coeff3); p10 = _mm_add_epi16(L0, L3); p10 = _mm_add_epi16(p10, coeff4); p00 = _mm_add_epi16(p10, p00); p00 = _mm_srli_epi16(p00, 3); p01 = _mm_add_epi16(H1, H2); p01 = _mm_mullo_epi16(p01, coeff3); p11 = _mm_add_epi16(H0, H3); p11 = _mm_add_epi16(p11, coeff4); p01 = _mm_add_epi16(p11, p01); p01 = _mm_srli_epi16(p01, 3); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)&pfirst[1][i], p00); p10 = _mm_mullo_epi16(L1, coeff5); p20 = _mm_mullo_epi16(L2, coeff7); p30 = _mm_mullo_epi16(L3, coeff3); p00 = _mm_add_epi16(L0, coeff8); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 4); p11 = _mm_mullo_epi16(H1, coeff5); p21 = _mm_mullo_epi16(H2, coeff7); p31 = _mm_mullo_epi16(H3, coeff3); p01 = _mm_add_epi16(H0, coeff8); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 4); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)&pfirst[2][i], p00); p00 = _mm_add_epi16(L1, L2); p10 = _mm_add_epi16(L2, L3); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, coeff2); p00 = _mm_srli_epi16(p00, 2); p01 = _mm_add_epi16(H1, H2); p11 = _mm_add_epi16(H2, H3); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, coeff2); p01 = _mm_srli_epi16(p01, 2); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)&pfirst[3][i], p00); } if (i < line_size) { __m128i p00, p10, p20, p30; __m128i S0 = _mm_loadu_si128((__m128i*)(src)); __m128i S3 = _mm_loadu_si128((__m128i*)(src + 3)); __m128i S1 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src + 2)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i L3 = _mm_unpacklo_epi8(S3, zero); p00 = _mm_mullo_epi16(L0, coeff3); p10 = _mm_mullo_epi16(L1, coeff7); p20 = _mm_mullo_epi16(L2, coeff5); p30 = _mm_add_epi16(L3, coeff8); p00 = _mm_add_epi16(p00, p30); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_srli_epi16(p00, 4); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)&pfirst[0][i], p00); p00 = _mm_add_epi16(L1, L2); p00 = _mm_mullo_epi16(p00, coeff3); p10 = _mm_add_epi16(L0, L3); p10 = _mm_add_epi16(p10, coeff4); p00 = _mm_add_epi16(p10, p00); p00 = _mm_srli_epi16(p00, 3); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)&pfirst[1][i], p00); p10 = _mm_mullo_epi16(L1, coeff5); p20 = _mm_mullo_epi16(L2, coeff7); p30 = _mm_mullo_epi16(L3, coeff3); p00 = _mm_add_epi16(L0, coeff8); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 4); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)&pfirst[2][i], p00); p00 = _mm_add_epi16(L1, L2); p10 = _mm_add_epi16(L2, L3); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, coeff2); p00 = _mm_srli_epi16(p00, 2); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)&pfirst[3][i], p00); } bsy >>= 2; if (bsx != 8) { int i_dstx4 = i_dst << 2; 
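            /* Dispatch on the block width: output row r is pfirst[r & 3]
             * advanced by (r >> 2) samples.  A rough scalar equivalent of
             * every case below (a sketch of the shared pattern, not a
             * separately compiled path) is:
             *     for (i = 0; i < bsy; i++) {
             *         memcpy(dst1, pfirst[0] + i, bsx * sizeof(pel_t)); dst1 += i_dstx4;
             *         memcpy(dst2, pfirst[1] + i, bsx * sizeof(pel_t)); dst2 += i_dstx4;
             *         memcpy(dst3, pfirst[2] + i, bsx * sizeof(pel_t)); dst3 += i_dstx4;
             *         memcpy(dst4, pfirst[3] + i, bsx * sizeof(pel_t)); dst4 += i_dstx4;
             *     }
             * The switch presumably exists so that each memcpy gets a
             * compile-time size. */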
            switch (bsx) {
            case 4:
                for (i = 0; i < bsy; i++) {
                    CP32(dst1, pfirst[0] + i); dst1 += i_dstx4;
                    CP32(dst2, pfirst[1] + i); dst2 += i_dstx4;
                    CP32(dst3, pfirst[2] + i); dst3 += i_dstx4;
                    CP32(dst4, pfirst[3] + i); dst4 += i_dstx4;
                }
                break;
            case 16:
                for (i = 0; i < bsy; i++) {
                    memcpy(dst1, pfirst[0] + i, 16 * sizeof(pel_t)); dst1 += i_dstx4;
                    memcpy(dst2, pfirst[1] + i, 16 * sizeof(pel_t)); dst2 += i_dstx4;
                    memcpy(dst3, pfirst[2] + i, 16 * sizeof(pel_t)); dst3 += i_dstx4;
                    memcpy(dst4, pfirst[3] + i, 16 * sizeof(pel_t)); dst4 += i_dstx4;
                }
                break;
            case 32:
                for (i = 0; i < bsy; i++) {
                    memcpy(dst1, pfirst[0] + i, 32 * sizeof(pel_t)); dst1 += i_dstx4;
                    memcpy(dst2, pfirst[1] + i, 32 * sizeof(pel_t)); dst2 += i_dstx4;
                    memcpy(dst3, pfirst[2] + i, 32 * sizeof(pel_t)); dst3 += i_dstx4;
                    memcpy(dst4, pfirst[3] + i, 32 * sizeof(pel_t)); dst4 += i_dstx4;
                }
                break;
            case 64:
                for (i = 0; i < bsy; i++) {
                    memcpy(dst1, pfirst[0] + i, 64 * sizeof(pel_t)); dst1 += i_dstx4;
                    memcpy(dst2, pfirst[1] + i, 64 * sizeof(pel_t)); dst2 += i_dstx4;
                    memcpy(dst3, pfirst[2] + i, 64 * sizeof(pel_t)); dst3 += i_dstx4;
                    memcpy(dst4, pfirst[3] + i, 64 * sizeof(pel_t)); dst4 += i_dstx4;
                }
                break;
            default:
                assert(0);
                break;
            }
        } else {
            if (bsy == 2) {
                for (i = 0; i < bsy; i++) {
                    CP64(dst1, pfirst[0] + i);
                    CP64(dst2, pfirst[1] + i);
                    CP64(dst3, pfirst[2] + i);
                    CP64(dst4, pfirst[3] + i);
                    dst1 = dst4 + i_dst; dst2 = dst1 + i_dst;
                    dst3 = dst2 + i_dst; dst4 = dst3 + i_dst;
                }
            } else {
                __m128i M1 = _mm_loadu_si128((__m128i*)&pfirst[0][0]);
                __m128i M2 = _mm_loadu_si128((__m128i*)&pfirst[1][0]);
                __m128i M3 = _mm_loadu_si128((__m128i*)&pfirst[2][0]);
                __m128i M4 = _mm_loadu_si128((__m128i*)&pfirst[3][0]);
                _mm_storel_epi64((__m128i*)dst1, M1); _mm_storel_epi64((__m128i*)dst2, M2);
                _mm_storel_epi64((__m128i*)dst3, M3); _mm_storel_epi64((__m128i*)dst4, M4);
                dst1 = dst4 + i_dst; dst2 = dst1 + i_dst;
                dst3 = dst2 + i_dst; dst4 = dst3 + i_dst;
                M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1);
                M3 = _mm_srli_si128(M3, 1); M4 = _mm_srli_si128(M4, 1);
                _mm_storel_epi64((__m128i*)dst1, M1); _mm_storel_epi64((__m128i*)dst2, M2);
                _mm_storel_epi64((__m128i*)dst3, M3); _mm_storel_epi64((__m128i*)dst4, M4);
                dst1 = dst4 + i_dst; dst2 = dst1 + i_dst;
                dst3 = dst2 + i_dst; dst4 = dst3 + i_dst;
                M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1);
                M3 = _mm_srli_si128(M3, 1); M4 = _mm_srli_si128(M4, 1);
                _mm_storel_epi64((__m128i*)dst1, M1); _mm_storel_epi64((__m128i*)dst2, M2);
                _mm_storel_epi64((__m128i*)dst3, M3); _mm_storel_epi64((__m128i*)dst4, M4);
                dst1 = dst4 + i_dst; dst2 = dst1 + i_dst;
                dst3 = dst2 + i_dst; dst4 = dst3 + i_dst;
                M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1);
                M3 = _mm_srli_si128(M3, 1); M4 = _mm_srli_si128(M4, 1);
                _mm_storel_epi64((__m128i*)dst1, M1); _mm_storel_epi64((__m128i*)dst2, M2);
                _mm_storel_epi64((__m128i*)dst3, M3); _mm_storel_epi64((__m128i*)dst4, M4);
                dst1 = dst4 + i_dst; dst2 = dst1 + i_dst;
                dst3 = dst2 + i_dst; dst4 = dst3 + i_dst;
                M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1);
                M3 = _mm_srli_si128(M3, 1); M4 = _mm_srli_si128(M4, 1);
                _mm_storel_epi64((__m128i*)dst1, M1); _mm_storel_epi64((__m128i*)dst2, M2);
                _mm_storel_epi64((__m128i*)dst3, M3); _mm_storel_epi64((__m128i*)dst4, M4);
                dst1 = dst4 + i_dst; dst2 = dst1 + i_dst;
                dst3 = dst2 + i_dst; dst4 = dst3 + i_dst;
                M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1);
                M3 = _mm_srli_si128(M3, 1); M4 = _mm_srli_si128(M4, 1);
                _mm_storel_epi64((__m128i*)dst1, M1); _mm_storel_epi64((__m128i*)dst2, M2);
                _mm_storel_epi64((__m128i*)dst3, M3); _mm_storel_epi64((__m128i*)dst4, M4);
                dst1 = dst4 + i_dst; dst2 = dst1 + i_dst;
                dst3 = dst2 + i_dst; dst4 = dst3 + i_dst;
                M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1);
                M3 = _mm_srli_si128(M3, 1); M4 = _mm_srli_si128(M4, 1);
                _mm_storel_epi64((__m128i*)dst1, M1); _mm_storel_epi64((__m128i*)dst2, M2);
                _mm_storel_epi64((__m128i*)dst3, M3);
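                /* pfirst[3] feeds every fourth output row via the dst4 chain */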
_mm_storel_epi64((__m128i*)dst4, M4); dst1 = dst4 + i_dst; dst2 = dst1 + i_dst; dst3 = dst2 + i_dst; dst4 = dst3 + i_dst; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); M3 = _mm_srli_si128(M3, 1); M4 = _mm_srli_si128(M4, 1); _mm_storel_epi64((__m128i*)dst1, M1); _mm_storel_epi64((__m128i*)dst2, M2); _mm_storel_epi64((__m128i*)dst3, M3); _mm_storel_epi64((__m128i*)dst4, M4); dst1 = dst4 + i_dst; dst2 = dst1 + i_dst; dst3 = dst2 + i_dst; dst4 = dst3 + i_dst; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); M3 = _mm_srli_si128(M3, 1); M4 = _mm_srli_si128(M4, 1); _mm_storel_epi64((__m128i*)dst1, M1); _mm_storel_epi64((__m128i*)dst2, M2); _mm_storel_epi64((__m128i*)dst3, M3); _mm_storel_epi64((__m128i*)dst4, M4); } } } else { if (bsx == 16) { __m128i p00, p10, p20, p30; __m128i p01, p11, p21, p31; __m128i S0 = _mm_loadu_si128((__m128i*)(src)); __m128i S3 = _mm_loadu_si128((__m128i*)(src + 3)); __m128i S1 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src + 2)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i L3 = _mm_unpacklo_epi8(S3, zero); __m128i H0 = _mm_unpackhi_epi8(S0, zero); __m128i H1 = _mm_unpackhi_epi8(S1, zero); __m128i H2 = _mm_unpackhi_epi8(S2, zero); __m128i H3 = _mm_unpackhi_epi8(S3, zero); p00 = _mm_mullo_epi16(L0, coeff3); p10 = _mm_mullo_epi16(L1, coeff7); p20 = _mm_mullo_epi16(L2, coeff5); p30 = _mm_add_epi16(L3, coeff8); p00 = _mm_add_epi16(p00, p30); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_srli_epi16(p00, 4); p01 = _mm_mullo_epi16(H0, coeff3); p11 = _mm_mullo_epi16(H1, coeff7); p21 = _mm_mullo_epi16(H2, coeff5); p31 = _mm_add_epi16(H3, coeff8); p01 = _mm_add_epi16(p01, p31); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_srli_epi16(p01, 4); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)dst1, p00); p00 = _mm_add_epi16(L1, L2); p00 = _mm_mullo_epi16(p00, coeff3); p10 = _mm_add_epi16(L0, L3); p10 = _mm_add_epi16(p10, coeff4); p00 = _mm_add_epi16(p10, p00); p00 = _mm_srli_epi16(p00, 3); p01 = _mm_add_epi16(H1, H2); p01 = _mm_mullo_epi16(p01, coeff3); p11 = _mm_add_epi16(H0, H3); p11 = _mm_add_epi16(p11, coeff4); p01 = _mm_add_epi16(p11, p01); p01 = _mm_srli_epi16(p01, 3); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)dst2, p00); p10 = _mm_mullo_epi16(L1, coeff5); p20 = _mm_mullo_epi16(L2, coeff7); p30 = _mm_mullo_epi16(L3, coeff3); p00 = _mm_add_epi16(L0, coeff8); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 4); p11 = _mm_mullo_epi16(H1, coeff5); p21 = _mm_mullo_epi16(H2, coeff7); p31 = _mm_mullo_epi16(H3, coeff3); p01 = _mm_add_epi16(H0, coeff8); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 4); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)dst3, p00); p00 = _mm_add_epi16(L1, L2); p10 = _mm_add_epi16(L2, L3); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, coeff2); p00 = _mm_srli_epi16(p00, 2); p01 = _mm_add_epi16(H1, H2); p11 = _mm_add_epi16(H2, H3); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, coeff2); p01 = _mm_srli_epi16(p01, 2); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)dst4, p00); } else { __m128i p00, p10, p20, p30; __m128i S0 = _mm_loadu_si128((__m128i*)(src)); __m128i S3 = _mm_loadu_si128((__m128i*)(src + 3)); __m128i S1 = _mm_loadu_si128((__m128i*)(src + 
1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src + 2)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i L3 = _mm_unpacklo_epi8(S3, zero); p00 = _mm_mullo_epi16(L0, coeff3); p10 = _mm_mullo_epi16(L1, coeff7); p20 = _mm_mullo_epi16(L2, coeff5); p30 = _mm_add_epi16(L3, coeff8); p00 = _mm_add_epi16(p00, p30); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_srli_epi16(p00, 4); p00 = _mm_packus_epi16(p00, p00); ((int*)(dst1))[0] = _mm_cvtsi128_si32(p00); p00 = _mm_add_epi16(L1, L2); p00 = _mm_mullo_epi16(p00, coeff3); p10 = _mm_add_epi16(L0, L3); p10 = _mm_add_epi16(p10, coeff4); p00 = _mm_add_epi16(p10, p00); p00 = _mm_srli_epi16(p00, 3); p00 = _mm_packus_epi16(p00, p00); ((int*)(dst2))[0] = _mm_cvtsi128_si32(p00); p10 = _mm_mullo_epi16(L1, coeff5); p20 = _mm_mullo_epi16(L2, coeff7); p30 = _mm_mullo_epi16(L3, coeff3); p00 = _mm_add_epi16(L0, coeff8); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 4); p00 = _mm_packus_epi16(p00, p00); ((int*)(dst3))[0] = _mm_cvtsi128_si32(p00); p00 = _mm_add_epi16(L1, L2); p10 = _mm_add_epi16(L2, L3); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, coeff2); p00 = _mm_srli_epi16(p00, 2); p00 = _mm_packus_epi16(p00, p00); ((int*)(dst4))[0] = _mm_cvtsi128_si32(p00); } } } /* --------------------------------------------------------------------------- */ void intra_pred_ang_x_11_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i, j, idx; __m128i zero = _mm_setzero_si128(); __m128i S0, S1, S2, S3; __m128i t0, t1, t2, t3; __m128i off = _mm_set1_epi16(64); __m128i c0; UNUSED_PARAMETER(dir_mode); if (bsx & 0x07) { __m128i D0; int i_dst2 = i_dst << 1; for (j = 0; j < bsy; j += 2) { idx = (j + 1) >> 3; c0 = _mm_load_si128((__m128i*)tab_coeff_mode_11[j & 0x07]); S0 = _mm_loadl_epi64((__m128i*)(src + idx)); S1 = _mm_srli_si128(S0, 1); S2 = _mm_srli_si128(S0, 2); S3 = _mm_srli_si128(S0, 3); t0 = _mm_unpacklo_epi8(S0, S1); t1 = _mm_unpacklo_epi8(S2, S3); t2 = _mm_unpacklo_epi16(t0, t1); t0 = _mm_maddubs_epi16(t2, c0); idx = (j + 2) >> 3; c0 = _mm_load_si128((__m128i*)tab_coeff_mode_11[(j + 1) & 0x07]); S0 = _mm_loadl_epi64((__m128i*)(src + idx)); S1 = _mm_srli_si128(S0, 1); S2 = _mm_srli_si128(S0, 2); S3 = _mm_srli_si128(S0, 3); t1 = _mm_unpacklo_epi8(S0, S1); t2 = _mm_unpacklo_epi8(S2, S3); t1 = _mm_unpacklo_epi16(t1, t2); t1 = _mm_maddubs_epi16(t1, c0); D0 = _mm_hadds_epi16(t0, t1); D0 = _mm_add_epi16(D0, off); D0 = _mm_srli_epi16(D0, 7); D0 = _mm_packus_epi16(D0, zero); ((uint32_t*)(dst))[0] = _mm_cvtsi128_si32(D0); D0 = _mm_srli_si128(D0, 4); ((uint32_t*)(dst + i_dst))[0] = _mm_cvtsi128_si32(D0); //dst[i] = (pel_t)((src[idx] * c1 + src[idx + 1] * c2 + src[idx + 2] * c3 + src[idx + 3] * c4 + 64) >> 7); dst += i_dst2; } } else if (bsx & 0x0f) { __m128i D0; for (j = 0; j < bsy; j++) { idx = (j + 1) >> 3; c0 = _mm_load_si128((__m128i*)tab_coeff_mode_11[j & 0x07]); S0 = _mm_loadu_si128((__m128i*)(src + idx)); S1 = _mm_srli_si128(S0, 1); S2 = _mm_srli_si128(S0, 2); S3 = _mm_srli_si128(S0, 3); t0 = _mm_unpacklo_epi8(S0, S1); t1 = _mm_unpacklo_epi8(S2, S3); t2 = _mm_unpacklo_epi16(t0, t1); t3 = _mm_unpackhi_epi16(t0, t1); t0 = _mm_maddubs_epi16(t2, c0); t1 = _mm_maddubs_epi16(t3, c0); D0 = _mm_hadds_epi16(t0, t1); D0 = _mm_add_epi16(D0, off); D0 = _mm_srli_epi16(D0, 7); D0 = _mm_packus_epi16(D0, _mm_setzero_si128()); _mm_storel_epi64((__m128i*)(dst), D0); //dst[i] 
= (pel_t)((src[idx] * c1 + src[idx + 1] * c2 + src[idx + 2] * c3 + src[idx + 3] * c4 + 64) >> 7); dst += i_dst; } } else { for (j = 0; j < bsy; j++) { __m128i D0, D1; idx = (j + 1) >> 3; c0 = _mm_load_si128((__m128i*)tab_coeff_mode_11[j & 0x07]); for (i = 0; i < bsx; i += 16, idx += 16) { S0 = _mm_loadu_si128((__m128i*)(src + idx)); S1 = _mm_loadu_si128((__m128i*)(src + idx + 1)); S2 = _mm_loadu_si128((__m128i*)(src + idx + 2)); S3 = _mm_loadu_si128((__m128i*)(src + idx + 3)); t0 = _mm_unpacklo_epi8(S0, S1); t1 = _mm_unpacklo_epi8(S2, S3); t2 = _mm_unpacklo_epi16(t0, t1); t3 = _mm_unpackhi_epi16(t0, t1); t0 = _mm_maddubs_epi16(t2, c0); t1 = _mm_maddubs_epi16(t3, c0); D0 = _mm_hadds_epi16(t0, t1); D0 = _mm_add_epi16(D0, off); D0 = _mm_srli_epi16(D0, 7); t0 = _mm_unpackhi_epi8(S0, S1); t1 = _mm_unpackhi_epi8(S2, S3); t2 = _mm_unpacklo_epi16(t0, t1); t3 = _mm_unpackhi_epi16(t0, t1); t0 = _mm_maddubs_epi16(t2, c0); t1 = _mm_maddubs_epi16(t3, c0); D1 = _mm_hadds_epi16(t0, t1); D1 = _mm_add_epi16(D1, off); D1 = _mm_srli_epi16(D1, 7); D0 = _mm_packus_epi16(D0, D1); _mm_storeu_si128((__m128i*)(dst + i), D0); //dst[i] = (pel_t)((src[idx] * c1 + src[idx + 1] * c2 + src[idx + 2] * c3 + src[idx + 3] * c4 + 64) >> 7); } dst += i_dst; } } } /* --------------------------------------------------------------------------- */ void intra_pred_ang_y_25_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i; UNUSED_PARAMETER(dir_mode); if (bsx > 8) { ALIGN16(pel_t first_line[64 + (64 << 3)]); int line_size = bsx + ((bsy - 1) << 3); int iHeight8 = bsy << 3; pel_t *pfirst = first_line; __m128i coeff0 = _mm_setr_epi16(7, 3, 5, 1, 3, 1, 1, 0); __m128i coeff1 = _mm_setr_epi16(15, 7, 13, 3, 11, 5, 9, 1); __m128i coeff2 = _mm_setr_epi16(9, 5, 11, 3, 13, 7, 15, 2); __m128i coeff3 = _mm_setr_epi16(1, 1, 3, 1, 5, 3, 7, 1); __m128i coeff4 = _mm_setr_epi16(16, 8, 16, 4, 16, 8, 16, 2); __m128i coeff5 = _mm_setr_epi16(1, 2, 1, 4, 1, 2, 1, 8); __m128i p00, p10, p20, p30; __m128i L0 = _mm_set1_epi16(src[0]); __m128i L1 = _mm_set1_epi16(src[-1]); __m128i L2 = _mm_set1_epi16(src[-2]); __m128i L3 = _mm_set1_epi16(src[-3]); src -= 4; for (i = 0; i < line_size - 24; i += 32, src -= 4) { p00 = _mm_mullo_epi16(L0, coeff0); p10 = _mm_mullo_epi16(L1, coeff1); p20 = _mm_mullo_epi16(L2, coeff2); p30 = _mm_mullo_epi16(L3, coeff3); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_mullo_epi16(p00, coeff5); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)pfirst, p00); pfirst += 8; L0 = _mm_set1_epi16(src[0]); p00 = _mm_mullo_epi16(L1, coeff0); p10 = _mm_mullo_epi16(L2, coeff1); p20 = _mm_mullo_epi16(L3, coeff2); p30 = _mm_mullo_epi16(L0, coeff3); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_mullo_epi16(p00, coeff5); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)pfirst, p00); pfirst += 8; L1 = _mm_set1_epi16(src[-1]); p00 = _mm_mullo_epi16(L2, coeff0); p10 = _mm_mullo_epi16(L3, coeff1); p20 = _mm_mullo_epi16(L0, coeff2); p30 = _mm_mullo_epi16(L1, coeff3); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_mullo_epi16(p00, coeff5); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)pfirst, p00); pfirst += 8; L2 = 
_mm_set1_epi16(src[-2]); p00 = _mm_mullo_epi16(L3, coeff0); p10 = _mm_mullo_epi16(L0, coeff1); p20 = _mm_mullo_epi16(L1, coeff2); p30 = _mm_mullo_epi16(L2, coeff3); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_mullo_epi16(p00, coeff5); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)pfirst, p00); pfirst += 8; L3 = _mm_set1_epi16(src[-3]); } if (bsx == 16) { p00 = _mm_mullo_epi16(L0, coeff0); p10 = _mm_mullo_epi16(L1, coeff1); p20 = _mm_mullo_epi16(L2, coeff2); p30 = _mm_mullo_epi16(L3, coeff3); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_mullo_epi16(p00, coeff5); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)pfirst, p00); } else { p00 = _mm_mullo_epi16(L0, coeff0); p10 = _mm_mullo_epi16(L1, coeff1); p20 = _mm_mullo_epi16(L2, coeff2); p30 = _mm_mullo_epi16(L3, coeff3); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_mullo_epi16(p00, coeff5); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)pfirst, p00); pfirst += 8; L0 = _mm_set1_epi16(src[0]); p00 = _mm_mullo_epi16(L1, coeff0); p10 = _mm_mullo_epi16(L2, coeff1); p20 = _mm_mullo_epi16(L3, coeff2); p30 = _mm_mullo_epi16(L0, coeff3); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_mullo_epi16(p00, coeff5); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)pfirst, p00); pfirst += 8; L1 = _mm_set1_epi16(src[-1]); p00 = _mm_mullo_epi16(L2, coeff0); p10 = _mm_mullo_epi16(L3, coeff1); p20 = _mm_mullo_epi16(L0, coeff2); p30 = _mm_mullo_epi16(L1, coeff3); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_mullo_epi16(p00, coeff5); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)pfirst, p00); } for (i = 0; i < iHeight8; i += 8) { memcpy(dst, first_line + i, bsx * sizeof(pel_t)); dst += i_dst; } } else if (bsx == 8) { __m128i coeff0 = _mm_setr_epi16(7, 3, 5, 1, 3, 1, 1, 0); __m128i coeff1 = _mm_setr_epi16(15, 7, 13, 3, 11, 5, 9, 1); __m128i coeff2 = _mm_setr_epi16(9, 5, 11, 3, 13, 7, 15, 2); __m128i coeff3 = _mm_setr_epi16(1, 1, 3, 1, 5, 3, 7, 1); __m128i coeff4 = _mm_setr_epi16(16, 8, 16, 4, 16, 8, 16, 2); __m128i coeff5 = _mm_setr_epi16(1, 2, 1, 4, 1, 2, 1, 8); __m128i p00, p10, p20, p30; __m128i L0 = _mm_set1_epi16(src[0]); __m128i L1 = _mm_set1_epi16(src[-1]); __m128i L2 = _mm_set1_epi16(src[-2]); __m128i L3 = _mm_set1_epi16(src[-3]); src -= 4; bsy >>= 2; for (i = 0; i < bsy; i++, src -= 4) { p00 = _mm_mullo_epi16(L0, coeff0); p10 = _mm_mullo_epi16(L1, coeff1); p20 = _mm_mullo_epi16(L2, coeff2); p30 = _mm_mullo_epi16(L3, coeff3); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_mullo_epi16(p00, coeff5); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)dst, p00); dst += i_dst; L0 = _mm_set1_epi16(src[0]); p00 = _mm_mullo_epi16(L1, coeff0); p10 = _mm_mullo_epi16(L2, coeff1); p20 = _mm_mullo_epi16(L3, coeff2); p30 = _mm_mullo_epi16(L0, coeff3); p00 = _mm_add_epi16(p00, 
coeff4); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_mullo_epi16(p00, coeff5); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)dst, p00); dst += i_dst; L1 = _mm_set1_epi16(src[-1]); p00 = _mm_mullo_epi16(L2, coeff0); p10 = _mm_mullo_epi16(L3, coeff1); p20 = _mm_mullo_epi16(L0, coeff2); p30 = _mm_mullo_epi16(L1, coeff3); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_mullo_epi16(p00, coeff5); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)dst, p00); dst += i_dst; L2 = _mm_set1_epi16(src[-2]); p00 = _mm_mullo_epi16(L3, coeff0); p10 = _mm_mullo_epi16(L0, coeff1); p20 = _mm_mullo_epi16(L1, coeff2); p30 = _mm_mullo_epi16(L2, coeff3); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_mullo_epi16(p00, coeff5); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)dst, p00); dst += i_dst; L3 = _mm_set1_epi16(src[-3]); } } else { __m128i zero = _mm_setzero_si128(); __m128i coeff3 = _mm_set1_epi16(3); __m128i coeff4 = _mm_set1_epi16(4); __m128i coeff5 = _mm_set1_epi16(5); __m128i coeff7 = _mm_set1_epi16(7); __m128i coeff8 = _mm_set1_epi16(8); __m128i coeff9 = _mm_set1_epi16(9); __m128i coeff11 = _mm_set1_epi16(11); __m128i coeff13 = _mm_set1_epi16(13); __m128i coeff15 = _mm_set1_epi16(15); __m128i coeff16 = _mm_set1_epi16(16); __m128i shuffle = _mm_setr_epi8(7, 15, 6, 14, 5, 13, 4, 12, 3, 11, 2, 10, 1, 9, 0, 8); if (bsy == 4) { src -= 15; __m128i p01, p11, p21, p31; __m128i M2, M4, M6, M8; __m128i S0 = _mm_loadu_si128((__m128i*)(src)); __m128i S3 = _mm_loadu_si128((__m128i*)(src - 3)); __m128i S1 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src - 2)); __m128i H0 = _mm_unpackhi_epi8(S0, zero); __m128i H1 = _mm_unpackhi_epi8(S1, zero); __m128i H2 = _mm_unpackhi_epi8(S2, zero); __m128i H3 = _mm_unpackhi_epi8(S3, zero); p01 = _mm_mullo_epi16(H0, coeff7); p11 = _mm_mullo_epi16(H1, coeff15); p21 = _mm_mullo_epi16(H2, coeff9); p31 = _mm_add_epi16(H3, coeff16); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); M2 = _mm_srli_epi16(p01, 5); p01 = _mm_mullo_epi16(H0, coeff3); p11 = _mm_mullo_epi16(H1, coeff7); p21 = _mm_mullo_epi16(H2, coeff5); p31 = _mm_add_epi16(H3, coeff8); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); M4 = _mm_srli_epi16(p01, 4); p01 = _mm_mullo_epi16(H0, coeff5); p11 = _mm_mullo_epi16(H1, coeff13); p21 = _mm_mullo_epi16(H2, coeff11); p31 = _mm_mullo_epi16(H3, coeff3); p01 = _mm_add_epi16(p01, coeff16); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); M6 = _mm_srli_epi16(p01, 5); p01 = _mm_add_epi16(H0, H3); p11 = _mm_add_epi16(H1, H2); p11 = _mm_mullo_epi16(p11, coeff3); p01 = _mm_add_epi16(p01, coeff4); p01 = _mm_add_epi16(p01, p11); M8 = _mm_srli_epi16(p01, 3); M2 = _mm_packus_epi16(M2, M4); M6 = _mm_packus_epi16(M6, M8); M2 = _mm_shuffle_epi8(M2, shuffle); M6 = _mm_shuffle_epi8(M6, shuffle); M4 = _mm_unpacklo_epi16(M2, M6); *((int*)dst) = _mm_cvtsi128_si32(M4); dst += i_dst; M4 = _mm_srli_si128(M4, 4); *((int*)dst) = _mm_cvtsi128_si32(M4); dst += i_dst; M4 = _mm_srli_si128(M4, 4); *((int*)dst) = _mm_cvtsi128_si32(M4); dst += i_dst; M4 = _mm_srli_si128(M4, 4); 
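            /* the low dword of M4 now holds the fourth and last 4-sample row of this block */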
*((int*)dst) = _mm_cvtsi128_si32(M4); } else { src -= 15; __m128i p00, p10, p20, p30; __m128i p01, p11, p21, p31; __m128i M1, M2, M3, M4, M5, M6, M7, M8; __m128i S0 = _mm_loadu_si128((__m128i*)(src)); __m128i S3 = _mm_loadu_si128((__m128i*)(src - 3)); __m128i S1 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src - 2)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i L3 = _mm_unpacklo_epi8(S3, zero); __m128i H0 = _mm_unpackhi_epi8(S0, zero); __m128i H1 = _mm_unpackhi_epi8(S1, zero); __m128i H2 = _mm_unpackhi_epi8(S2, zero); __m128i H3 = _mm_unpackhi_epi8(S3, zero); p00 = _mm_mullo_epi16(L0, coeff7); p10 = _mm_mullo_epi16(L1, coeff15); p20 = _mm_mullo_epi16(L2, coeff9); p30 = _mm_add_epi16(L3, coeff16); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); M1 = _mm_srli_epi16(p00, 5); p01 = _mm_mullo_epi16(H0, coeff7); p11 = _mm_mullo_epi16(H1, coeff15); p21 = _mm_mullo_epi16(H2, coeff9); p31 = _mm_add_epi16(H3, coeff16); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); M2 = _mm_srli_epi16(p01, 5); p00 = _mm_mullo_epi16(L0, coeff3); p10 = _mm_mullo_epi16(L1, coeff7); p20 = _mm_mullo_epi16(L2, coeff5); p30 = _mm_add_epi16(L3, coeff8); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); M3 = _mm_srli_epi16(p00, 4); p01 = _mm_mullo_epi16(H0, coeff3); p11 = _mm_mullo_epi16(H1, coeff7); p21 = _mm_mullo_epi16(H2, coeff5); p31 = _mm_add_epi16(H3, coeff8); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); M4 = _mm_srli_epi16(p01, 4); p00 = _mm_mullo_epi16(L0, coeff5); p10 = _mm_mullo_epi16(L1, coeff13); p20 = _mm_mullo_epi16(L2, coeff11); p30 = _mm_mullo_epi16(L3, coeff3); p00 = _mm_add_epi16(p00, coeff16); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); M5 = _mm_srli_epi16(p00, 5); p01 = _mm_mullo_epi16(H0, coeff5); p11 = _mm_mullo_epi16(H1, coeff13); p21 = _mm_mullo_epi16(H2, coeff11); p31 = _mm_mullo_epi16(H3, coeff3); p01 = _mm_add_epi16(p01, coeff16); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); M6 = _mm_srli_epi16(p01, 5); p00 = _mm_add_epi16(L0, L3); p10 = _mm_add_epi16(L1, L2); p10 = _mm_mullo_epi16(p10, coeff3); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_add_epi16(p00, p10); M7 = _mm_srli_epi16(p00, 3); p01 = _mm_add_epi16(H0, H3); p11 = _mm_add_epi16(H1, H2); p11 = _mm_mullo_epi16(p11, coeff3); p01 = _mm_add_epi16(p01, coeff4); p01 = _mm_add_epi16(p01, p11); M8 = _mm_srli_epi16(p01, 3); M1 = _mm_packus_epi16(M1, M3); M5 = _mm_packus_epi16(M5, M7); M1 = _mm_shuffle_epi8(M1, shuffle); M5 = _mm_shuffle_epi8(M5, shuffle); M2 = _mm_packus_epi16(M2, M4); M6 = _mm_packus_epi16(M6, M8); M2 = _mm_shuffle_epi8(M2, shuffle); M6 = _mm_shuffle_epi8(M6, shuffle); M3 = _mm_unpacklo_epi16(M1, M5); M7 = _mm_unpackhi_epi16(M1, M5); M4 = _mm_unpacklo_epi16(M2, M6); M8 = _mm_unpackhi_epi16(M2, M6); *((int*)dst) = _mm_cvtsi128_si32(M4); dst += i_dst; M4 = _mm_srli_si128(M4, 4); *((int*)dst) = _mm_cvtsi128_si32(M4); dst += i_dst; M4 = _mm_srli_si128(M4, 4); *((int*)dst) = _mm_cvtsi128_si32(M4); dst += i_dst; M4 = _mm_srli_si128(M4, 4); *((int*)dst) = _mm_cvtsi128_si32(M4); dst += i_dst; *((int*)dst) = _mm_cvtsi128_si32(M8); dst += i_dst; M8 = _mm_srli_si128(M8, 4); *((int*)dst) = _mm_cvtsi128_si32(M8); dst += i_dst; M8 = 
_mm_srli_si128(M8, 4); *((int*)dst) = _mm_cvtsi128_si32(M8); dst += i_dst; M8 = _mm_srli_si128(M8, 4); *((int*)dst) = _mm_cvtsi128_si32(M8); dst += i_dst; *((int*)dst) = _mm_cvtsi128_si32(M3); dst += i_dst; M3 = _mm_srli_si128(M3, 4); *((int*)dst) = _mm_cvtsi128_si32(M3); dst += i_dst; M3 = _mm_srli_si128(M3, 4); *((int*)dst) = _mm_cvtsi128_si32(M3); dst += i_dst; M3 = _mm_srli_si128(M3, 4); *((int*)dst) = _mm_cvtsi128_si32(M3); dst += i_dst; *((int*)dst) = _mm_cvtsi128_si32(M7); dst += i_dst; M7 = _mm_srli_si128(M7, 4); *((int*)dst) = _mm_cvtsi128_si32(M7); dst += i_dst; M7 = _mm_srli_si128(M7, 4); *((int*)dst) = _mm_cvtsi128_si32(M7); dst += i_dst; M7 = _mm_srli_si128(M7, 4); *((int*)dst) = _mm_cvtsi128_si32(M7); } } } /* --------------------------------------------------------------------------- */ void intra_pred_ang_y_26_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i; UNUSED_PARAMETER(dir_mode); if (bsx != 4) { __m128i zero = _mm_setzero_si128(); __m128i coeff2 = _mm_set1_epi16(2); __m128i coeff3 = _mm_set1_epi16(3); __m128i coeff4 = _mm_set1_epi16(4); __m128i coeff5 = _mm_set1_epi16(5); __m128i coeff7 = _mm_set1_epi16(7); __m128i coeff8 = _mm_set1_epi16(8); __m128i shuffle = _mm_setr_epi8(7, 15, 6, 14, 5, 13, 4, 12, 3, 11, 2, 10, 1, 9, 0, 8); ALIGN16(pel_t first_line[64 + 256]); int line_size = bsx + (bsy - 1) * 4; int iHeight4 = bsy << 2; src -= 15; for (i = 0; i < line_size - 32; i += 64, src -= 16) { __m128i p00, p10, p20, p30; __m128i p01, p11, p21, p31; __m128i M1, M2, M3, M4, M5, M6, M7, M8; __m128i S0 = _mm_loadu_si128((__m128i*)(src)); __m128i S3 = _mm_loadu_si128((__m128i*)(src - 3)); __m128i S1 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src - 2)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i L3 = _mm_unpacklo_epi8(S3, zero); __m128i H0 = _mm_unpackhi_epi8(S0, zero); __m128i H1 = _mm_unpackhi_epi8(S1, zero); __m128i H2 = _mm_unpackhi_epi8(S2, zero); __m128i H3 = _mm_unpackhi_epi8(S3, zero); p00 = _mm_mullo_epi16(L0, coeff3); p10 = _mm_mullo_epi16(L1, coeff7); p20 = _mm_mullo_epi16(L2, coeff5); p30 = _mm_add_epi16(L3, coeff8); p00 = _mm_add_epi16(p00, p30); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); M1 = _mm_srli_epi16(p00, 4); p01 = _mm_mullo_epi16(H0, coeff3); p11 = _mm_mullo_epi16(H1, coeff7); p21 = _mm_mullo_epi16(H2, coeff5); p31 = _mm_add_epi16(H3, coeff8); p01 = _mm_add_epi16(p01, p31); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); M2 = _mm_srli_epi16(p01, 4); p00 = _mm_add_epi16(L1, L2); p00 = _mm_mullo_epi16(p00, coeff3); p10 = _mm_add_epi16(L0, L3); p10 = _mm_add_epi16(p10, coeff4); p00 = _mm_add_epi16(p10, p00); M3 = _mm_srli_epi16(p00, 3); p01 = _mm_add_epi16(H1, H2); p01 = _mm_mullo_epi16(p01, coeff3); p11 = _mm_add_epi16(H0, H3); p11 = _mm_add_epi16(p11, coeff4); p01 = _mm_add_epi16(p11, p01); M4 = _mm_srli_epi16(p01, 3); p10 = _mm_mullo_epi16(L1, coeff5); p20 = _mm_mullo_epi16(L2, coeff7); p30 = _mm_mullo_epi16(L3, coeff3); p00 = _mm_add_epi16(L0, coeff8); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); M5 = _mm_srli_epi16(p00, 4); p11 = _mm_mullo_epi16(H1, coeff5); p21 = _mm_mullo_epi16(H2, coeff7); p31 = _mm_mullo_epi16(H3, coeff3); p01 = _mm_add_epi16(H0, coeff8); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); M6 = _mm_srli_epi16(p01, 4); p00 = _mm_add_epi16(L1, L2); p10 = 
_mm_add_epi16(L2, L3); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, coeff2); M7 = _mm_srli_epi16(p00, 2); p01 = _mm_add_epi16(H1, H2); p11 = _mm_add_epi16(H2, H3); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, coeff2); M8 = _mm_srli_epi16(p01, 2); M1 = _mm_packus_epi16(M1, M3); M5 = _mm_packus_epi16(M5, M7); M1 = _mm_shuffle_epi8(M1, shuffle); M5 = _mm_shuffle_epi8(M5, shuffle); M2 = _mm_packus_epi16(M2, M4); M6 = _mm_packus_epi16(M6, M8); M2 = _mm_shuffle_epi8(M2, shuffle); M6 = _mm_shuffle_epi8(M6, shuffle); M3 = _mm_unpacklo_epi16(M1, M5); M7 = _mm_unpackhi_epi16(M1, M5); M4 = _mm_unpacklo_epi16(M2, M6); M8 = _mm_unpackhi_epi16(M2, M6); _mm_store_si128((__m128i*)&first_line[i], M4); _mm_store_si128((__m128i*)&first_line[16 + i], M8); _mm_store_si128((__m128i*)&first_line[32 + i], M3); _mm_store_si128((__m128i*)&first_line[48 + i], M7); } if (i < line_size) { __m128i p01, p11, p21, p31; __m128i M2, M4, M6, M8; __m128i S0 = _mm_loadu_si128((__m128i*)(src)); __m128i S3 = _mm_loadu_si128((__m128i*)(src - 3)); __m128i S1 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src - 2)); __m128i H0 = _mm_unpackhi_epi8(S0, zero); __m128i H1 = _mm_unpackhi_epi8(S1, zero); __m128i H2 = _mm_unpackhi_epi8(S2, zero); __m128i H3 = _mm_unpackhi_epi8(S3, zero); p01 = _mm_mullo_epi16(H0, coeff3); p11 = _mm_mullo_epi16(H1, coeff7); p21 = _mm_mullo_epi16(H2, coeff5); p31 = _mm_add_epi16(H3, coeff8); p01 = _mm_add_epi16(p01, p31); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); M2 = _mm_srli_epi16(p01, 4); p01 = _mm_add_epi16(H1, H2); p01 = _mm_mullo_epi16(p01, coeff3); p11 = _mm_add_epi16(H0, H3); p11 = _mm_add_epi16(p11, coeff4); p01 = _mm_add_epi16(p11, p01); M4 = _mm_srli_epi16(p01, 3); p11 = _mm_mullo_epi16(H1, coeff5); p21 = _mm_mullo_epi16(H2, coeff7); p31 = _mm_mullo_epi16(H3, coeff3); p01 = _mm_add_epi16(H0, coeff8); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); M6 = _mm_srli_epi16(p01, 4); p01 = _mm_add_epi16(H1, H2); p11 = _mm_add_epi16(H2, H3); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, coeff2); M8 = _mm_srli_epi16(p01, 2); M2 = _mm_packus_epi16(M2, M4); M6 = _mm_packus_epi16(M6, M8); M2 = _mm_shuffle_epi8(M2, shuffle); M6 = _mm_shuffle_epi8(M6, shuffle); M4 = _mm_unpacklo_epi16(M2, M6); M8 = _mm_unpackhi_epi16(M2, M6); _mm_store_si128((__m128i*)&first_line[i], M4); _mm_store_si128((__m128i*)&first_line[16 + i], M8); } switch (bsx) { case 4: for (i = 0; i < iHeight4; i += 4) { CP32(dst, first_line + i); dst += i_dst; } break; case 8: for (i = 0; i < iHeight4; i += 4) { CP64(dst, first_line + i); dst += i_dst; } break; default: for (i = 0; i < iHeight4; i += 4) { memcpy(dst, first_line + i, bsx * sizeof(pel_t)); dst += i_dst; } break; } } else { __m128i zero = _mm_setzero_si128(); __m128i coeff2 = _mm_set1_epi16(2); __m128i coeff3 = _mm_set1_epi16(3); __m128i coeff4 = _mm_set1_epi16(4); __m128i coeff5 = _mm_set1_epi16(5); __m128i coeff7 = _mm_set1_epi16(7); __m128i coeff8 = _mm_set1_epi16(8); __m128i shuffle = _mm_setr_epi8(7, 15, 6, 14, 5, 13, 4, 12, 3, 11, 2, 10, 1, 9, 0, 8); src -= 15; if (bsy == 4) { __m128i p01, p11, p21, p31; __m128i M2, M4, M6, M8; __m128i S0 = _mm_loadu_si128((__m128i*)(src)); __m128i S3 = _mm_loadu_si128((__m128i*)(src - 3)); __m128i S1 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src - 2)); __m128i H0 = _mm_unpackhi_epi8(S0, zero); __m128i H1 = _mm_unpackhi_epi8(S1, zero); __m128i H2 = _mm_unpackhi_epi8(S2, zero); 
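            /* H0..H3 are the four shifted reference lines widened to 16 bits
             * for the 4-tap filters that follow */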
__m128i H3 = _mm_unpackhi_epi8(S3, zero); p01 = _mm_mullo_epi16(H0, coeff3); p11 = _mm_mullo_epi16(H1, coeff7); p21 = _mm_mullo_epi16(H2, coeff5); p31 = _mm_add_epi16(H3, coeff8); p01 = _mm_add_epi16(p01, p31); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); M2 = _mm_srli_epi16(p01, 4); p01 = _mm_add_epi16(H1, H2); p01 = _mm_mullo_epi16(p01, coeff3); p11 = _mm_add_epi16(H0, H3); p11 = _mm_add_epi16(p11, coeff4); p01 = _mm_add_epi16(p11, p01); M4 = _mm_srli_epi16(p01, 3); p11 = _mm_mullo_epi16(H1, coeff5); p21 = _mm_mullo_epi16(H2, coeff7); p31 = _mm_mullo_epi16(H3, coeff3); p01 = _mm_add_epi16(H0, coeff8); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); M6 = _mm_srli_epi16(p01, 4); p01 = _mm_add_epi16(H1, H2); p11 = _mm_add_epi16(H2, H3); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, coeff2); M8 = _mm_srli_epi16(p01, 2); M2 = _mm_packus_epi16(M2, M4); M6 = _mm_packus_epi16(M6, M8); M2 = _mm_shuffle_epi8(M2, shuffle); M6 = _mm_shuffle_epi8(M6, shuffle); M4 = _mm_unpacklo_epi16(M2, M6); ((int*)dst)[0] = _mm_cvtsi128_si32(M4); dst += i_dst; M4 = _mm_srli_si128(M4, 4); ((int*)dst)[0] = _mm_cvtsi128_si32(M4); dst += i_dst; M4 = _mm_srli_si128(M4, 4); ((int*)dst)[0] = _mm_cvtsi128_si32(M4); dst += i_dst; M4 = _mm_srli_si128(M4, 4); ((int*)dst)[0] = _mm_cvtsi128_si32(M4); } else { __m128i p00, p10, p20, p30; __m128i p01, p11, p21, p31; __m128i M1, M2, M3, M4, M5, M6, M7, M8; __m128i S0 = _mm_loadu_si128((__m128i*)(src)); __m128i S3 = _mm_loadu_si128((__m128i*)(src - 3)); __m128i S1 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src - 2)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i L3 = _mm_unpacklo_epi8(S3, zero); __m128i H0 = _mm_unpackhi_epi8(S0, zero); __m128i H1 = _mm_unpackhi_epi8(S1, zero); __m128i H2 = _mm_unpackhi_epi8(S2, zero); __m128i H3 = _mm_unpackhi_epi8(S3, zero); p00 = _mm_mullo_epi16(L0, coeff3); p10 = _mm_mullo_epi16(L1, coeff7); p20 = _mm_mullo_epi16(L2, coeff5); p30 = _mm_add_epi16(L3, coeff8); p00 = _mm_add_epi16(p00, p30); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); M1 = _mm_srli_epi16(p00, 4); p01 = _mm_mullo_epi16(H0, coeff3); p11 = _mm_mullo_epi16(H1, coeff7); p21 = _mm_mullo_epi16(H2, coeff5); p31 = _mm_add_epi16(H3, coeff8); p01 = _mm_add_epi16(p01, p31); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); M2 = _mm_srli_epi16(p01, 4); p00 = _mm_add_epi16(L1, L2); p00 = _mm_mullo_epi16(p00, coeff3); p10 = _mm_add_epi16(L0, L3); p10 = _mm_add_epi16(p10, coeff4); p00 = _mm_add_epi16(p10, p00); M3 = _mm_srli_epi16(p00, 3); p01 = _mm_add_epi16(H1, H2); p01 = _mm_mullo_epi16(p01, coeff3); p11 = _mm_add_epi16(H0, H3); p11 = _mm_add_epi16(p11, coeff4); p01 = _mm_add_epi16(p11, p01); M4 = _mm_srli_epi16(p01, 3); p10 = _mm_mullo_epi16(L1, coeff5); p20 = _mm_mullo_epi16(L2, coeff7); p30 = _mm_mullo_epi16(L3, coeff3); p00 = _mm_add_epi16(L0, coeff8); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); M5 = _mm_srli_epi16(p00, 4); p11 = _mm_mullo_epi16(H1, coeff5); p21 = _mm_mullo_epi16(H2, coeff7); p31 = _mm_mullo_epi16(H3, coeff3); p01 = _mm_add_epi16(H0, coeff8); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); M6 = _mm_srli_epi16(p01, 4); p00 = _mm_add_epi16(L1, L2); p10 = _mm_add_epi16(L2, L3); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, coeff2); M7 = 
_mm_srli_epi16(p00, 2); p01 = _mm_add_epi16(H1, H2); p11 = _mm_add_epi16(H2, H3); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, coeff2); M8 = _mm_srli_epi16(p01, 2); M1 = _mm_packus_epi16(M1, M3); M5 = _mm_packus_epi16(M5, M7); M1 = _mm_shuffle_epi8(M1, shuffle); M5 = _mm_shuffle_epi8(M5, shuffle); M2 = _mm_packus_epi16(M2, M4); M6 = _mm_packus_epi16(M6, M8); M2 = _mm_shuffle_epi8(M2, shuffle); M6 = _mm_shuffle_epi8(M6, shuffle); M3 = _mm_unpacklo_epi16(M1, M5); M7 = _mm_unpackhi_epi16(M1, M5); M4 = _mm_unpacklo_epi16(M2, M6); M8 = _mm_unpackhi_epi16(M2, M6); ((int*)dst)[0] = _mm_cvtsi128_si32(M4); dst += i_dst; M4 = _mm_srli_si128(M4, 4); ((int*)dst)[0] = _mm_cvtsi128_si32(M4); dst += i_dst; M4 = _mm_srli_si128(M4, 4); ((int*)dst)[0] = _mm_cvtsi128_si32(M4); dst += i_dst; M4 = _mm_srli_si128(M4, 4); ((int*)dst)[0] = _mm_cvtsi128_si32(M4); dst += i_dst; ((int*)dst)[0] = _mm_cvtsi128_si32(M8); dst += i_dst; M8 = _mm_srli_si128(M8, 4); ((int*)dst)[0] = _mm_cvtsi128_si32(M8); dst += i_dst; M8 = _mm_srli_si128(M8, 4); ((int*)dst)[0] = _mm_cvtsi128_si32(M8); dst += i_dst; M8 = _mm_srli_si128(M8, 4); ((int*)dst)[0] = _mm_cvtsi128_si32(M8); dst += i_dst; ((int*)dst)[0] = _mm_cvtsi128_si32(M3); dst += i_dst; M3 = _mm_srli_si128(M3, 4); ((int*)dst)[0] = _mm_cvtsi128_si32(M3); dst += i_dst; M3 = _mm_srli_si128(M3, 4); ((int*)dst)[0] = _mm_cvtsi128_si32(M3); dst += i_dst; M3 = _mm_srli_si128(M3, 4); ((int*)dst)[0] = _mm_cvtsi128_si32(M3); dst += i_dst; ((int*)dst)[0] = _mm_cvtsi128_si32(M7); dst += i_dst; M7 = _mm_srli_si128(M7, 4); ((int*)dst)[0] = _mm_cvtsi128_si32(M7); dst += i_dst; M7 = _mm_srli_si128(M7, 4); ((int*)dst)[0] = _mm_cvtsi128_si32(M7); dst += i_dst; M7 = _mm_srli_si128(M7, 4); ((int*)dst)[0] = _mm_cvtsi128_si32(M7); } } } /* --------------------------------------------------------------------------- */ void intra_pred_ang_y_28_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { ALIGN16(pel_t first_line[64 + 128]); int line_size = bsx + (bsy - 1) * 2; #if !BUGFIX_PREDICTION_INTRA int real_size = DAVS2_MIN(line_size, bsy * 4); #endif int i; int iHeight2 = bsy << 1; #if !BUGFIX_PREDICTION_INTRA __m128i pad; #endif __m128i coeff2 = _mm_set1_epi16(2); __m128i coeff3 = _mm_set1_epi16(3); __m128i coeff4 = _mm_set1_epi16(4); __m128i shuffle = _mm_setr_epi8(7, 15, 6, 14, 5, 13, 4, 12, 3, 11, 2, 10, 1, 9, 0, 8); __m128i zero = _mm_setzero_si128(); UNUSED_PARAMETER(dir_mode); src -= 15; #if BUGFIX_PREDICTION_INTRA for (i = 0; i < line_size - 16; i += 32, src -= 16) { #else for (i = 0; i < real_size - 16; i += 32, src -= 16) { #endif __m128i p00, p10, p01, p11; __m128i S0 = _mm_loadu_si128((__m128i*)(src)); __m128i S3 = _mm_loadu_si128((__m128i*)(src - 3)); __m128i S1 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src - 2)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i L3 = _mm_unpacklo_epi8(S3, zero); __m128i H0 = _mm_unpackhi_epi8(S0, zero); __m128i H1 = _mm_unpackhi_epi8(S1, zero); __m128i H2 = _mm_unpackhi_epi8(S2, zero); __m128i H3 = _mm_unpackhi_epi8(S3, zero); p00 = _mm_adds_epi16(L1, L2); p01 = _mm_add_epi16(L1, L2); p00 = _mm_mullo_epi16(p00, coeff3); p10 = _mm_adds_epi16(L0, L3); p11 = _mm_add_epi16(L2, L3); p10 = _mm_adds_epi16(p10, coeff4); p00 = _mm_adds_epi16(p00, p10); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, coeff2); p00 = _mm_srli_epi16(p00, 3); p01 = _mm_srli_epi16(p01, 2); p00 = _mm_packus_epi16(p00, p01); 
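/* low-half lanes: pack the 4-tap ((a+3b+3c+d+4)>>3) and 3-tap ((b+2c+d+2)>>2) outputs (a..d = consecutive reference pixels), interleave them in reversed order via the shuffle, and store to first_line[i+16]; the high-half pass below fills first_line[i] */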
p00 = _mm_shuffle_epi8(p00, shuffle); _mm_store_si128((__m128i*)&first_line[i + 16], p00); p00 = _mm_adds_epi16(H1, H2); p01 = _mm_add_epi16(H1, H2); p00 = _mm_mullo_epi16(p00, coeff3); p10 = _mm_adds_epi16(H0, H3); p11 = _mm_add_epi16(H2, H3); p10 = _mm_adds_epi16(p10, coeff4); p00 = _mm_adds_epi16(p00, p10); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, coeff2); p00 = _mm_srli_epi16(p00, 3); p01 = _mm_srli_epi16(p01, 2); p00 = _mm_packus_epi16(p00, p01); p00 = _mm_shuffle_epi8(p00, shuffle); _mm_store_si128((__m128i*)&first_line[i], p00); } #if BUGFIX_PREDICTION_INTRA if (i < line_size) { #else if (i < real_size) { #endif __m128i p00, p10, p01, p11; __m128i S0 = _mm_loadu_si128((__m128i*)(src)); __m128i S3 = _mm_loadu_si128((__m128i*)(src - 3)); __m128i S1 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src - 2)); __m128i H0 = _mm_unpackhi_epi8(S0, zero); __m128i H1 = _mm_unpackhi_epi8(S1, zero); __m128i H2 = _mm_unpackhi_epi8(S2, zero); __m128i H3 = _mm_unpackhi_epi8(S3, zero); p00 = _mm_adds_epi16(H1, H2); p01 = _mm_add_epi16(H1, H2); p00 = _mm_mullo_epi16(p00, coeff3); p10 = _mm_adds_epi16(H0, H3); p11 = _mm_add_epi16(H2, H3); p10 = _mm_adds_epi16(p10, coeff4); p00 = _mm_adds_epi16(p00, p10); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, coeff2); p00 = _mm_srli_epi16(p00, 3); p01 = _mm_srli_epi16(p01, 2); p00 = _mm_packus_epi16(p00, p01); p00 = _mm_shuffle_epi8(p00, shuffle); _mm_store_si128((__m128i*)&first_line[i], p00); } #if !BUGFIX_PREDICTION_INTRA // padding if (real_size < line_size) { i = real_size; first_line[i - 1] = first_line[i - 3]; pad = _mm_set1_epi16(((short*)&first_line[i - 2])[0]); for (; i < line_size; i += 16) { _mm_storeu_si128((__m128i*)&first_line[i], pad); } } #endif if (bsx >= 16) { for (i = 0; i < iHeight2; i += 2) { memcpy(dst, first_line + i, bsx * sizeof(pel_t)); dst += i_dst; } } else if (bsx == 8) { for (i = 0; i < iHeight2; i += 8) { __m128i M = _mm_loadu_si128((__m128i*)&first_line[i]); _mm_storel_epi64((__m128i*)(dst), M); dst += i_dst; M = _mm_srli_si128(M, 2); _mm_storel_epi64((__m128i*)(dst), M); dst += i_dst; M = _mm_srli_si128(M, 2); _mm_storel_epi64((__m128i*)(dst), M); dst += i_dst; M = _mm_srli_si128(M, 2); _mm_storel_epi64((__m128i*)(dst), M); dst += i_dst; } } else { for (i = 0; i < iHeight2; i += 8) { __m128i M = _mm_loadu_si128((__m128i*)&first_line[i]); ((int*)(dst))[0] = _mm_cvtsi128_si32(M); dst += i_dst; M = _mm_srli_si128(M, 2); ((int*)(dst))[0] = _mm_cvtsi128_si32(M); dst += i_dst; M = _mm_srli_si128(M, 2); ((int*)(dst))[0] = _mm_cvtsi128_si32(M); dst += i_dst; M = _mm_srli_si128(M, 2); ((int*)(dst))[0] = _mm_cvtsi128_si32(M); dst += i_dst; } } } /* --------------------------------------------------------------------------- */ void intra_pred_ang_y_30_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { ALIGN16(pel_t first_line[64 + 64]); int line_size = bsx + bsy - 1; #if !BUGFIX_PREDICTION_INTRA int real_size = DAVS2_MIN(line_size, bsy * 2 - 1); #endif int i; __m128i coeff2 = _mm_set1_epi16(2); __m128i shuffle = _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); __m128i zero = _mm_setzero_si128(); UNUSED_PARAMETER(dir_mode); src -= 17; #if BUGFIX_PREDICTION_INTRA for (i = 0; i < line_size - 8; i += 16, src -= 16) { #else for (i = 0; i < real_size - 8; i += 16, src -= 16) { #endif __m128i p00, p10, p01, p11; __m128i S0 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i S1 = 
_mm_loadu_si128((__m128i*)(src)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i H0 = _mm_unpackhi_epi8(S0, zero); __m128i H1 = _mm_unpackhi_epi8(S1, zero); __m128i H2 = _mm_unpackhi_epi8(S2, zero); p00 = _mm_add_epi16(L0, L1); p10 = _mm_add_epi16(L1, L2); p01 = _mm_add_epi16(H0, H1); p11 = _mm_add_epi16(H1, H2); p00 = _mm_add_epi16(p00, p10); p01 = _mm_add_epi16(p01, p11); p00 = _mm_add_epi16(p00, coeff2); p01 = _mm_add_epi16(p01, coeff2); p00 = _mm_srli_epi16(p00, 2); p01 = _mm_srli_epi16(p01, 2); p00 = _mm_packus_epi16(p00, p01); p00 = _mm_shuffle_epi8(p00, shuffle); _mm_store_si128((__m128i*)&first_line[i], p00); } #if BUGFIX_PREDICTION_INTRA if (i < line_size) { #else if (i < real_size) { #endif __m128i p01, p11; __m128i S0 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i S1 = _mm_loadu_si128((__m128i*)(src)); __m128i H0 = _mm_unpackhi_epi8(S0, zero); __m128i H1 = _mm_unpackhi_epi8(S1, zero); __m128i H2 = _mm_unpackhi_epi8(S2, zero); p01 = _mm_add_epi16(H0, H1); p11 = _mm_add_epi16(H1, H2); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, coeff2); p01 = _mm_srli_epi16(p01, 2); p01 = _mm_packus_epi16(p01, p01); p01 = _mm_shuffle_epi8(p01, shuffle); _mm_store_si128((__m128i*)&first_line[i], p01); } #if !BUGFIX_PREDICTION_INTRA // padding for (i = real_size; i < line_size; i += 16) { __m128i pad = _mm_set1_epi8(first_line[real_size - 1]); _mm_storeu_si128((__m128i*)&first_line[i], pad); } #endif if (bsx > 16) { for (i = 0; i < bsy; i++) { memcpy(dst, first_line + i, bsx * sizeof(pel_t)); dst += i_dst; } } else if (bsx == 16) { pel_t *dst1 = dst; pel_t *dst2; if (bsy == 4) { __m128i M = _mm_loadu_si128((__m128i*)&first_line[0]); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); dst1 = dst + 8; M = _mm_loadu_si128((__m128i*)&first_line[8]); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); } else { __m128i M = _mm_loadu_si128((__m128i*)&first_line[0]); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); dst2 = dst1 + i_dst; dst1 = dst + 8; M = _mm_loadu_si128((__m128i*)&first_line[8]); _mm_storel_epi64((__m128i*)dst1, M); _mm_storel_epi64((__m128i*)dst2, M); dst1 += i_dst; dst2 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); _mm_storel_epi64((__m128i*)dst2, M); dst1 += i_dst; dst2 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); _mm_storel_epi64((__m128i*)dst2, M); dst1 += i_dst; dst2 += i_dst; M = _mm_srli_si128(M, 1); 
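/* mode 30 is a pure one-pixel-per-row diagonal: row r, columns 8..15 reads the same first_line span as row r+8, columns 0..7, so each shifted register is stored to both destinations */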
_mm_storel_epi64((__m128i*)dst1, M); _mm_storel_epi64((__m128i*)dst2, M); dst1 += i_dst; dst2 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); _mm_storel_epi64((__m128i*)dst2, M); dst1 += i_dst; dst2 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); _mm_storel_epi64((__m128i*)dst2, M); dst1 += i_dst; dst2 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); _mm_storel_epi64((__m128i*)dst2, M); dst1 += i_dst; dst2 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); _mm_storel_epi64((__m128i*)dst2, M); dst1 += i_dst; M = _mm_loadu_si128((__m128i*)&first_line[16]); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); dst1 += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst1, M); } } else if (bsx == 8) { for (i = 0; i < bsy; i += 8) { __m128i M = _mm_loadu_si128((__m128i*)&first_line[i]); _mm_storel_epi64((__m128i*)dst, M); dst += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst, M); dst += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst, M); dst += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst, M); dst += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst, M); dst += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst, M); dst += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst, M); dst += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst, M); dst += i_dst; } } else { for (i = 0; i < bsy; i += 4) { __m128i M = _mm_loadu_si128((__m128i*)&first_line[i]); ((int*)(dst))[0] = _mm_cvtsi128_si32(M); dst += i_dst; M = _mm_srli_si128(M, 1); ((int*)(dst))[0] = _mm_cvtsi128_si32(M); dst += i_dst; M = _mm_srli_si128(M, 1); ((int*)(dst))[0] = _mm_cvtsi128_si32(M); dst += i_dst; M = _mm_srli_si128(M, 1); ((int*)(dst))[0] = _mm_cvtsi128_si32(M); dst += i_dst; } } } /* --------------------------------------------------------------------------- */ void intra_pred_ang_y_31_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { ALIGN16(pel_t dst_tran[64 * 80]); ALIGN16(pel_t src_tran[64 * 8]); int i_dst2 = (((bsy + 15) >> 4) << 4) + 16; int i; UNUSED_PARAMETER(dir_mode); //transposition #if BUGFIX_PREDICTION_INTRA for (i = 0; i < (bsy + bsx * 11 / 8 + 3); i++) { #else for (i = 0; i < (2 * bsy + 3); i++) { #endif src_tran[i] = src[-i]; } intra_pred_ang_x_5_sse128(src_tran, dst_tran, i_dst2, 5, bsy, bsx); if ((bsy > 4) && (bsx > 4)) { pel_t *pDst_128[64]; pel_t *pTra_128[64]; int iSize_x = bsx >> 3; int iSize_y = bsy >> 3; int iSize = iSize_x * iSize_y; for (int y = 0; y < iSize_y; y++) { for (int x = 0; x < iSize_x; x++) { pDst_128[x + y * iSize_x] = dst + x * 8 + y * 8 * i_dst; pTra_128[x + y * iSize_x] = dst_tran + y * 8 + x * 8 * i_dst2; } } for (i = 0; i < iSize; i++) { pel_t *dst_tran_org = pTra_128[i]; pel_t *dst1 = pDst_128[i]; pel_t *dst2 = dst1 + i_dst; pel_t *dst3 = dst2 + i_dst; pel_t *dst4 = dst3 + i_dst; pel_t *dst5 = dst4 + i_dst; pel_t *dst6 = dst5 + i_dst; pel_t *dst7 = dst6 + i_dst; pel_t *dst8 = dst7 + i_dst; __m128i Org_8_0, 
Org_8_1, Org_8_2, Org_8_3, Org_8_4, Org_8_5, Org_8_6, Org_8_7; __m128i p00, p10, p20, p30; __m128i t00, t10, t20, t30; Org_8_0 = _mm_loadu_si128((__m128i*)dst_tran_org); dst_tran_org += i_dst2; Org_8_1 = _mm_loadu_si128((__m128i*)dst_tran_org); dst_tran_org += i_dst2; Org_8_2 = _mm_loadu_si128((__m128i*)dst_tran_org); dst_tran_org += i_dst2; Org_8_3 = _mm_loadu_si128((__m128i*)dst_tran_org); dst_tran_org += i_dst2; Org_8_4 = _mm_loadu_si128((__m128i*)dst_tran_org); dst_tran_org += i_dst2; Org_8_5 = _mm_loadu_si128((__m128i*)dst_tran_org); dst_tran_org += i_dst2; Org_8_6 = _mm_loadu_si128((__m128i*)dst_tran_org); dst_tran_org += i_dst2; Org_8_7 = _mm_loadu_si128((__m128i*)dst_tran_org); dst_tran_org += i_dst2; p00 = _mm_unpacklo_epi8(Org_8_0, Org_8_1); p10 = _mm_unpacklo_epi8(Org_8_2, Org_8_3); p20 = _mm_unpacklo_epi8(Org_8_4, Org_8_5); p30 = _mm_unpacklo_epi8(Org_8_6, Org_8_7); t00 = _mm_unpacklo_epi16(p00, p10); t20 = _mm_unpacklo_epi16(p20, p30); t10 = _mm_unpackhi_epi16(p00, p10); t30 = _mm_unpackhi_epi16(p20, p30); p00 = _mm_unpacklo_epi32(t00, t20); p10 = _mm_unpackhi_epi32(t00, t20); p20 = _mm_unpacklo_epi32(t10, t30); p30 = _mm_unpackhi_epi32(t10, t30); _mm_storel_epi64((__m128i*)dst1, p00); p00 = _mm_srli_si128(p00, 8); _mm_storel_epi64((__m128i*)dst2, p00); _mm_storel_epi64((__m128i*)dst3, p10); p10 = _mm_srli_si128(p10, 8); _mm_storel_epi64((__m128i*)dst4, p10); _mm_storel_epi64((__m128i*)dst5, p20); p20 = _mm_srli_si128(p20, 8); _mm_storel_epi64((__m128i*)dst6, p20); _mm_storel_epi64((__m128i*)dst7, p30); p30 = _mm_srli_si128(p30, 8); _mm_storel_epi64((__m128i*)dst8, p30); } } else if (bsx == 16) { for (i = 0; i < 2; i++) { pel_t *dst_tran_org = dst_tran + i * 8 * i_dst2; pel_t *dst1 = dst + i * 8; pel_t *dst2 = dst1 + i_dst; pel_t *dst3 = dst2 + i_dst; pel_t *dst4 = dst3 + i_dst; __m128i Org_8_0, Org_8_1, Org_8_2, Org_8_3, Org_8_4, Org_8_5, Org_8_6, Org_8_7; __m128i p00, p10, p20, p30; __m128i t00, t20; Org_8_0 = _mm_loadu_si128((__m128i*)dst_tran_org); dst_tran_org += i_dst2; Org_8_1 = _mm_loadu_si128((__m128i*)dst_tran_org); dst_tran_org += i_dst2; Org_8_2 = _mm_loadu_si128((__m128i*)dst_tran_org); dst_tran_org += i_dst2; Org_8_3 = _mm_loadu_si128((__m128i*)dst_tran_org); dst_tran_org += i_dst2; Org_8_4 = _mm_loadu_si128((__m128i*)dst_tran_org); dst_tran_org += i_dst2; Org_8_5 = _mm_loadu_si128((__m128i*)dst_tran_org); dst_tran_org += i_dst2; Org_8_6 = _mm_loadu_si128((__m128i*)dst_tran_org); dst_tran_org += i_dst2; Org_8_7 = _mm_loadu_si128((__m128i*)dst_tran_org); dst_tran_org += i_dst2; p00 = _mm_unpacklo_epi8(Org_8_0, Org_8_1); p10 = _mm_unpacklo_epi8(Org_8_2, Org_8_3); p20 = _mm_unpacklo_epi8(Org_8_4, Org_8_5); p30 = _mm_unpacklo_epi8(Org_8_6, Org_8_7); t00 = _mm_unpacklo_epi16(p00, p10); t20 = _mm_unpacklo_epi16(p20, p30); p00 = _mm_unpacklo_epi32(t00, t20); p10 = _mm_unpackhi_epi32(t00, t20); _mm_storel_epi64((__m128i*)dst1, p00); p00 = _mm_srli_si128(p00, 8); _mm_storel_epi64((__m128i*)dst2, p00); _mm_storel_epi64((__m128i*)dst3, p10); p10 = _mm_srli_si128(p10, 8); _mm_storel_epi64((__m128i*)dst4, p10); } } else if (bsy == 16) {//bsx == 4 pel_t *dst_tran_org = dst_tran; pel_t *dst1 = dst; pel_t *dst2 = dst1 + i_dst; pel_t *dst3 = dst2 + i_dst; pel_t *dst4 = dst3 + i_dst; pel_t *dst5 = dst4 + i_dst; pel_t *dst6 = dst5 + i_dst; pel_t *dst7 = dst6 + i_dst; pel_t *dst8 = dst7 + i_dst; __m128i Org_8_0, Org_8_1, Org_8_2, Org_8_3; __m128i p00, p10; __m128i t00, t10; Org_8_0 = _mm_loadu_si128((__m128i*)dst_tran_org); dst_tran_org += i_dst2; Org_8_1 = 
_mm_loadu_si128((__m128i*)dst_tran_org); dst_tran_org += i_dst2; Org_8_2 = _mm_loadu_si128((__m128i*)dst_tran_org); dst_tran_org += i_dst2; Org_8_3 = _mm_loadu_si128((__m128i*)dst_tran_org); dst_tran_org += i_dst2; p00 = _mm_unpacklo_epi8(Org_8_0, Org_8_1); p10 = _mm_unpacklo_epi8(Org_8_2, Org_8_3); t00 = _mm_unpacklo_epi16(p00, p10); t10 = _mm_unpackhi_epi16(p00, p10); *((int*)(dst1)) = _mm_cvtsi128_si32(t00); t00 = _mm_srli_si128(t00, 4); *((int*)(dst2)) = _mm_cvtsi128_si32(t00); t00 = _mm_srli_si128(t00, 4); *((int*)(dst3)) = _mm_cvtsi128_si32(t00); t00 = _mm_srli_si128(t00, 4); *((int*)(dst4)) = _mm_cvtsi128_si32(t00); *((int*)(dst5)) = _mm_cvtsi128_si32(t10); t10 = _mm_srli_si128(t10, 4); *((int*)(dst6)) = _mm_cvtsi128_si32(t10); t10 = _mm_srli_si128(t10, 4); *((int*)(dst7)) = _mm_cvtsi128_si32(t10); t10 = _mm_srli_si128(t10, 4); *((int*)(dst8)) = _mm_cvtsi128_si32(t10); dst1 = dst8 + i_dst; dst2 = dst1 + i_dst; dst3 = dst2 + i_dst; dst4 = dst3 + i_dst; dst5 = dst4 + i_dst; dst6 = dst5 + i_dst; dst7 = dst6 + i_dst; dst8 = dst7 + i_dst; p00 = _mm_unpackhi_epi8(Org_8_0, Org_8_1); p10 = _mm_unpackhi_epi8(Org_8_2, Org_8_3); t00 = _mm_unpacklo_epi16(p00, p10); t10 = _mm_unpackhi_epi16(p00, p10); *((int*)(dst1)) = _mm_cvtsi128_si32(t00); t00 = _mm_srli_si128(t00, 4); *((int*)(dst2)) = _mm_cvtsi128_si32(t00); t00 = _mm_srli_si128(t00, 4); *((int*)(dst3)) = _mm_cvtsi128_si32(t00); t00 = _mm_srli_si128(t00, 4); *((int*)(dst4)) = _mm_cvtsi128_si32(t00); *((int*)(dst5)) = _mm_cvtsi128_si32(t10); t10 = _mm_srli_si128(t10, 4); *((int*)(dst6)) = _mm_cvtsi128_si32(t10); t10 = _mm_srli_si128(t10, 4); *((int*)(dst7)) = _mm_cvtsi128_si32(t10); t10 = _mm_srli_si128(t10, 4); *((int*)(dst8)) = _mm_cvtsi128_si32(t10); } else {// bsx == 4 bsy ==4 pel_t *dst_tran_org = dst_tran; pel_t *dst1 = dst; pel_t *dst2 = dst1 + i_dst; pel_t *dst3 = dst2 + i_dst; pel_t *dst4 = dst3 + i_dst; __m128i Org_8_0, Org_8_1, Org_8_2, Org_8_3; __m128i p00, p10; __m128i t00; Org_8_0 = _mm_loadu_si128((__m128i*)dst_tran_org); dst_tran_org += i_dst2; Org_8_1 = _mm_loadu_si128((__m128i*)dst_tran_org); dst_tran_org += i_dst2; Org_8_2 = _mm_loadu_si128((__m128i*)dst_tran_org); dst_tran_org += i_dst2; Org_8_3 = _mm_loadu_si128((__m128i*)dst_tran_org); dst_tran_org += i_dst2; p00 = _mm_unpacklo_epi8(Org_8_0, Org_8_1); p10 = _mm_unpacklo_epi8(Org_8_2, Org_8_3); t00 = _mm_unpacklo_epi16(p00, p10); *((int*)(dst1)) = _mm_cvtsi128_si32(t00); t00 = _mm_srli_si128(t00, 4); *((int*)(dst2)) = _mm_cvtsi128_si32(t00); t00 = _mm_srli_si128(t00, 4); *((int*)(dst3)) = _mm_cvtsi128_si32(t00); t00 = _mm_srli_si128(t00, 4); *((int*)(dst4)) = _mm_cvtsi128_si32(t00); } } /* --------------------------------------------------------------------------- */ void intra_pred_ang_y_32_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { ALIGN16(pel_t first_line[2 * (64 + 64)]); int line_size = (bsy >> 1) + bsx - 1; #if !BUGFIX_PREDICTION_INTRA int real_size = DAVS2_MIN(line_size, bsy - 1); __m128i pad_val; #endif int i; int aligned_line_size = ((line_size + 63) >> 4) << 4; pel_t *pfirst[2]; __m128i coeff2 = _mm_set1_epi16(2); __m128i zero = _mm_setzero_si128(); __m128i shuffle1 = _mm_setr_epi8(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0); __m128i shuffle2 = _mm_setr_epi8(14, 12, 10, 8, 6, 4, 2, 0, 15, 13, 11, 9, 7, 5, 3, 1); int i_dst2 = i_dst * 2; UNUSED_PARAMETER(dir_mode); pfirst[0] = first_line; pfirst[1] = first_line + aligned_line_size; src -= 18; #if BUGFIX_PREDICTION_INTRA for (i = 0; i < line_size - 4; i += 8, src -= 16) { 
#else for (i = 0; i < real_size - 4; i += 8, src -= 16) { #endif __m128i p00, p01, p10, p11; __m128i S2 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i S0 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S1 = _mm_loadu_si128((__m128i*)(src)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i H0 = _mm_unpackhi_epi8(S0, zero); __m128i H1 = _mm_unpackhi_epi8(S1, zero); __m128i H2 = _mm_unpackhi_epi8(S2, zero); p00 = _mm_add_epi16(L0, L1); p01 = _mm_add_epi16(L1, L2); p00 = _mm_add_epi16(p00, coeff2); p00 = _mm_add_epi16(p00, p01); p00 = _mm_srli_epi16(p00, 2); p10 = _mm_add_epi16(H0, H1); p11 = _mm_add_epi16(H1, H2); p10 = _mm_add_epi16(p10, coeff2); p10 = _mm_add_epi16(p10, p11); p10 = _mm_srli_epi16(p10, 2); p00 = _mm_packus_epi16(p00, p10); p10 = _mm_shuffle_epi8(p00, shuffle2); p00 = _mm_shuffle_epi8(p00, shuffle1); _mm_storel_epi64((__m128i*)&pfirst[0][i], p00); _mm_storel_epi64((__m128i*)&pfirst[1][i], p10); } #if BUGFIX_PREDICTION_INTRA if (i < line_size) { #else if (i < real_size) { #endif __m128i p10, p11; __m128i S2 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i S0 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S1 = _mm_loadu_si128((__m128i*)(src)); __m128i H0 = _mm_unpackhi_epi8(S0, zero); __m128i H1 = _mm_unpackhi_epi8(S1, zero); __m128i H2 = _mm_unpackhi_epi8(S2, zero); p10 = _mm_add_epi16(H0, H1); p11 = _mm_add_epi16(H1, H2); p10 = _mm_add_epi16(p10, coeff2); p10 = _mm_add_epi16(p10, p11); p10 = _mm_srli_epi16(p10, 2); p11 = _mm_packus_epi16(p10, p10); p10 = _mm_shuffle_epi8(p11, shuffle2); p11 = _mm_shuffle_epi8(p11, shuffle1); ((int*)&pfirst[0][i])[0] = _mm_cvtsi128_si32(p11); ((int*)&pfirst[1][i])[0] = _mm_cvtsi128_si32(p10); } #if !BUGFIX_PREDICTION_INTRA // padding if (real_size < line_size) { pad_val = _mm_set1_epi8(pfirst[1][real_size - 1]); for (i = real_size; i < line_size; i++) { _mm_storeu_si128((__m128i*)&pfirst[0][i], pad_val); _mm_storeu_si128((__m128i*)&pfirst[1][i], pad_val); } } #endif bsy >>= 1; if (bsx >= 16 || bsx == 4) { for (i = 0; i < bsy; i++) { memcpy(dst, pfirst[0] + i, bsx * sizeof(pel_t)); memcpy(dst + i_dst, pfirst[1] + i, bsx * sizeof(pel_t)); dst += i_dst2; } } else { if (bsy == 4) { __m128i M1 = _mm_loadu_si128((__m128i*)&pfirst[0][0]); __m128i M2 = _mm_loadu_si128((__m128i*)&pfirst[1][0]); _mm_storel_epi64((__m128i*)dst, M1); _mm_storel_epi64((__m128i*)(dst + i_dst), M2); dst += i_dst2; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); _mm_storel_epi64((__m128i*)dst, M1); _mm_storel_epi64((__m128i*)(dst + i_dst), M2); dst += i_dst2; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); _mm_storel_epi64((__m128i*)dst, M1); _mm_storel_epi64((__m128i*)(dst + i_dst), M2); dst += i_dst2; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); _mm_storel_epi64((__m128i*)dst, M1); _mm_storel_epi64((__m128i*)(dst + i_dst), M2); } else { for (i = 0; i < 16; i = i + 8) { __m128i M1 = _mm_loadu_si128((__m128i*)&pfirst[0][i]); __m128i M2 = _mm_loadu_si128((__m128i*)&pfirst[1][i]); _mm_storel_epi64((__m128i*)dst, M1); _mm_storel_epi64((__m128i*)(dst + i_dst), M2); dst += i_dst2; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); _mm_storel_epi64((__m128i*)dst, M1); _mm_storel_epi64((__m128i*)(dst + i_dst), M2); dst += i_dst2; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); _mm_storel_epi64((__m128i*)dst, M1); _mm_storel_epi64((__m128i*)(dst + i_dst), M2); dst += i_dst2; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); 
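/* mode 32 advances two output rows per first_line pixel: pfirst[0]/pfirst[1] hold the even/odd lines and each one-byte shift moves down one row pair */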
_mm_storel_epi64((__m128i*)dst, M1); _mm_storel_epi64((__m128i*)(dst + i_dst), M2); dst += i_dst2; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); _mm_storel_epi64((__m128i*)dst, M1); _mm_storel_epi64((__m128i*)(dst + i_dst), M2); dst += i_dst2; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); _mm_storel_epi64((__m128i*)dst, M1); _mm_storel_epi64((__m128i*)(dst + i_dst), M2); dst += i_dst2; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); _mm_storel_epi64((__m128i*)dst, M1); _mm_storel_epi64((__m128i*)(dst + i_dst), M2); dst += i_dst2; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); _mm_storel_epi64((__m128i*)dst, M1); _mm_storel_epi64((__m128i*)(dst + i_dst), M2); dst += i_dst2; } } } } /* --------------------------------------------------------------------------- */ void intra_pred_ang_xy_13_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { __m128i zero = _mm_setzero_si128(); __m128i coeff2 = _mm_set1_epi16(2); __m128i coeff3 = _mm_set1_epi16(3); __m128i coeff4 = _mm_set1_epi16(4); __m128i coeff5 = _mm_set1_epi16(5); __m128i coeff7 = _mm_set1_epi16(7); __m128i coeff8 = _mm_set1_epi16(8); __m128i coeff9 = _mm_set1_epi16(9); __m128i coeff11 = _mm_set1_epi16(11); __m128i coeff13 = _mm_set1_epi16(13); __m128i coeff15 = _mm_set1_epi16(15); __m128i coeff16 = _mm_set1_epi16(16); UNUSED_PARAMETER(dir_mode); int i; if (bsy > 8) { ALIGN16(pel_t first_line[(64 + 16) << 3]); int line_size = bsx + (bsy >> 3) - 1; int left_size = line_size - bsx; int aligned_line_size = ((line_size + 15) >> 4) << 4; pel_t *pfirst[8]; pfirst[0] = first_line; pfirst[1] = pfirst[0] + aligned_line_size; pfirst[2] = pfirst[1] + aligned_line_size; pfirst[3] = pfirst[2] + aligned_line_size; pfirst[4] = pfirst[3] + aligned_line_size; pfirst[5] = pfirst[4] + aligned_line_size; pfirst[6] = pfirst[5] + aligned_line_size; pfirst[7] = pfirst[6] + aligned_line_size; src -= bsy - 8; for (i = 0; i < left_size; i++, src += 8) { pfirst[0][i] = (pel_t)((src[6] + (src[7] << 1) + src[8] + 2) >> 2); pfirst[1][i] = (pel_t)((src[5] + (src[6] << 1) + src[7] + 2) >> 2); pfirst[2][i] = (pel_t)((src[4] + (src[5] << 1) + src[6] + 2) >> 2); pfirst[3][i] = (pel_t)((src[3] + (src[4] << 1) + src[5] + 2) >> 2); pfirst[4][i] = (pel_t)((src[2] + (src[3] << 1) + src[4] + 2) >> 2); pfirst[5][i] = (pel_t)((src[1] + (src[2] << 1) + src[3] + 2) >> 2); pfirst[6][i] = (pel_t)((src[0] + (src[1] << 1) + src[2] + 2) >> 2); pfirst[7][i] = (pel_t)((src[-1] + (src[0] << 1) + src[1] + 2) >> 2); } for (; i < line_size - 8; i += 16, src += 16) { __m128i p00, p10, p20, p30; __m128i p01, p11, p21, p31; __m128i S0 = _mm_loadu_si128((__m128i*)(src + 2)); __m128i S1 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src)); __m128i S3 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i L3 = _mm_unpacklo_epi8(S3, zero); __m128i H0 = _mm_unpackhi_epi8(S0, zero); __m128i H1 = _mm_unpackhi_epi8(S1, zero); __m128i H2 = _mm_unpackhi_epi8(S2, zero); __m128i H3 = _mm_unpackhi_epi8(S3, zero); p00 = _mm_mullo_epi16(L0, coeff7); p10 = _mm_mullo_epi16(L1, coeff15); p20 = _mm_mullo_epi16(L2, coeff9); p30 = _mm_add_epi16(L3, coeff16); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 5); p01 = _mm_mullo_epi16(H0, coeff7); p11 = _mm_mullo_epi16(H1, coeff15); p21 = _mm_mullo_epi16(H2, coeff9); p31 = _mm_add_epi16(H3, 
coeff16); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 5); p00 = _mm_packus_epi16(p00, p01); _mm_storeu_si128((__m128i*)&pfirst[0][i], p00); p00 = _mm_mullo_epi16(L0, coeff3); p10 = _mm_mullo_epi16(L1, coeff7); p20 = _mm_mullo_epi16(L2, coeff5); p30 = _mm_add_epi16(L3, coeff8); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 4); p01 = _mm_mullo_epi16(H0, coeff3); p11 = _mm_mullo_epi16(H1, coeff7); p21 = _mm_mullo_epi16(H2, coeff5); p31 = _mm_add_epi16(H3, coeff8); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 4); p00 = _mm_packus_epi16(p00, p01); _mm_storeu_si128((__m128i*)&pfirst[1][i], p00); p00 = _mm_mullo_epi16(L0, coeff5); p10 = _mm_mullo_epi16(L1, coeff13); p20 = _mm_mullo_epi16(L2, coeff11); p30 = _mm_mullo_epi16(L3, coeff3); p00 = _mm_add_epi16(p00, coeff16); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 5); p01 = _mm_mullo_epi16(H0, coeff5); p11 = _mm_mullo_epi16(H1, coeff13); p21 = _mm_mullo_epi16(H2, coeff11); p31 = _mm_mullo_epi16(H3, coeff3); p01 = _mm_add_epi16(p01, coeff16); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 5); p00 = _mm_packus_epi16(p00, p01); _mm_storeu_si128((__m128i*)&pfirst[2][i], p00); p00 = _mm_add_epi16(L0, L3); p10 = _mm_add_epi16(L1, L2); p10 = _mm_mullo_epi16(p10, coeff3); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_srli_epi16(p00, 3); p01 = _mm_add_epi16(H0, H3); p11 = _mm_add_epi16(H1, H2); p11 = _mm_mullo_epi16(p11, coeff3); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, coeff4); p01 = _mm_srli_epi16(p01, 3); p00 = _mm_packus_epi16(p00, p01); _mm_storeu_si128((__m128i*)&pfirst[3][i], p00); p00 = _mm_mullo_epi16(L0, coeff3); p10 = _mm_mullo_epi16(L1, coeff11); p20 = _mm_mullo_epi16(L2, coeff13); p30 = _mm_mullo_epi16(L3, coeff5); p00 = _mm_add_epi16(p00, coeff16); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 5); p01 = _mm_mullo_epi16(H0, coeff3); p11 = _mm_mullo_epi16(H1, coeff11); p21 = _mm_mullo_epi16(H2, coeff13); p31 = _mm_mullo_epi16(H3, coeff5); p01 = _mm_add_epi16(p01, coeff16); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 5); p00 = _mm_packus_epi16(p00, p01); _mm_storeu_si128((__m128i*)&pfirst[4][i], p00); p10 = _mm_mullo_epi16(L1, coeff5); p20 = _mm_mullo_epi16(L2, coeff7); p30 = _mm_mullo_epi16(L3, coeff3); p00 = _mm_add_epi16(L0, coeff8); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 4); p11 = _mm_mullo_epi16(H1, coeff5); p21 = _mm_mullo_epi16(H2, coeff7); p31 = _mm_mullo_epi16(H3, coeff3); p01 = _mm_add_epi16(H0, coeff8); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 4); p00 = _mm_packus_epi16(p00, p01); _mm_storeu_si128((__m128i*)&pfirst[5][i], p00); p10 = _mm_mullo_epi16(L1, coeff9); p20 = _mm_mullo_epi16(L2, coeff15); p30 = _mm_mullo_epi16(L3, coeff7); p00 = _mm_add_epi16(L0, coeff16); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 5); p11 = _mm_mullo_epi16(H1, coeff9); p21 = 
_mm_mullo_epi16(H2, coeff15); p31 = _mm_mullo_epi16(H3, coeff7); p01 = _mm_add_epi16(H0, coeff16); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 5); p00 = _mm_packus_epi16(p00, p01); _mm_storeu_si128((__m128i*)&pfirst[6][i], p00); p10 = _mm_mullo_epi16(L2, coeff2); p00 = _mm_add_epi16(L1, L3); p00 = _mm_add_epi16(p00, coeff2); p00 = _mm_add_epi16(p00, p10); p00 = _mm_srli_epi16(p00, 2); p11 = _mm_mullo_epi16(H2, coeff2); p01 = _mm_add_epi16(H1, H3); p01 = _mm_add_epi16(p01, coeff2); p01 = _mm_add_epi16(p01, p11); p01 = _mm_srli_epi16(p01, 2); p00 = _mm_packus_epi16(p00, p01); _mm_storeu_si128((__m128i*)&pfirst[7][i], p00); } if (i < line_size) { __m128i p00, p10, p20, p30; __m128i S0 = _mm_loadu_si128((__m128i*)(src + 2)); __m128i S1 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src)); __m128i S3 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i L3 = _mm_unpacklo_epi8(S3, zero); p00 = _mm_mullo_epi16(L0, coeff7); p10 = _mm_mullo_epi16(L1, coeff15); p20 = _mm_mullo_epi16(L2, coeff9); p30 = _mm_add_epi16(L3, coeff16); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)&pfirst[0][i], p00); p00 = _mm_mullo_epi16(L0, coeff3); p10 = _mm_mullo_epi16(L1, coeff7); p20 = _mm_mullo_epi16(L2, coeff5); p30 = _mm_add_epi16(L3, coeff8); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 4); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)&pfirst[1][i], p00); p00 = _mm_mullo_epi16(L0, coeff5); p10 = _mm_mullo_epi16(L1, coeff13); p20 = _mm_mullo_epi16(L2, coeff11); p30 = _mm_mullo_epi16(L3, coeff3); p00 = _mm_add_epi16(p00, coeff16); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)&pfirst[2][i], p00); p00 = _mm_add_epi16(L0, L3); p10 = _mm_add_epi16(L1, L2); p10 = _mm_mullo_epi16(p10, coeff3); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_srli_epi16(p00, 3); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)&pfirst[3][i], p00); p00 = _mm_mullo_epi16(L0, coeff3); p10 = _mm_mullo_epi16(L1, coeff11); p20 = _mm_mullo_epi16(L2, coeff13); p30 = _mm_mullo_epi16(L3, coeff5); p00 = _mm_add_epi16(p00, coeff16); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)&pfirst[4][i], p00); p10 = _mm_mullo_epi16(L1, coeff5); p20 = _mm_mullo_epi16(L2, coeff7); p30 = _mm_mullo_epi16(L3, coeff3); p00 = _mm_add_epi16(L0, coeff8); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 4); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)&pfirst[5][i], p00); p10 = _mm_mullo_epi16(L1, coeff9); p20 = _mm_mullo_epi16(L2, coeff15); p30 = _mm_mullo_epi16(L3, coeff7); p00 = _mm_add_epi16(L0, coeff16); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)&pfirst[6][i], p00); p10 = 
_mm_mullo_epi16(L2, coeff2); p00 = _mm_add_epi16(L1, L3); p00 = _mm_add_epi16(p00, coeff2); p00 = _mm_add_epi16(p00, p10); p00 = _mm_srli_epi16(p00, 2); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)&pfirst[7][i], p00); } pfirst[0] += left_size; pfirst[1] += left_size; pfirst[2] += left_size; pfirst[3] += left_size; pfirst[4] += left_size; pfirst[5] += left_size; pfirst[6] += left_size; pfirst[7] += left_size; bsy >>= 3; for (i = 0; i < bsy; i++) { memcpy(dst, pfirst[0] - i, bsx * sizeof(pel_t)); dst += i_dst; memcpy(dst, pfirst[1] - i, bsx * sizeof(pel_t)); dst += i_dst; memcpy(dst, pfirst[2] - i, bsx * sizeof(pel_t)); dst += i_dst; memcpy(dst, pfirst[3] - i, bsx * sizeof(pel_t)); dst += i_dst; memcpy(dst, pfirst[4] - i, bsx * sizeof(pel_t)); dst += i_dst; memcpy(dst, pfirst[5] - i, bsx * sizeof(pel_t)); dst += i_dst; memcpy(dst, pfirst[6] - i, bsx * sizeof(pel_t)); dst += i_dst; memcpy(dst, pfirst[7] - i, bsx * sizeof(pel_t)); dst += i_dst; } } else if (bsy == 8) { pel_t *dst1 = dst; pel_t *dst2 = dst1 + i_dst; pel_t *dst3 = dst2 + i_dst; pel_t *dst4 = dst3 + i_dst; pel_t *dst5 = dst4 + i_dst; pel_t *dst6 = dst5 + i_dst; pel_t *dst7 = dst6 + i_dst; pel_t *dst8 = dst7 + i_dst; if (bsx == 32) { __m128i p00, p10, p20, p30; __m128i p01, p11, p21, p31; __m128i S0 = _mm_loadu_si128((__m128i*)(src + 2)); __m128i S1 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src)); __m128i S3 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i L3 = _mm_unpacklo_epi8(S3, zero); __m128i H0 = _mm_unpackhi_epi8(S0, zero); __m128i H1 = _mm_unpackhi_epi8(S1, zero); __m128i H2 = _mm_unpackhi_epi8(S2, zero); __m128i H3 = _mm_unpackhi_epi8(S3, zero); p00 = _mm_mullo_epi16(L0, coeff7); p10 = _mm_mullo_epi16(L1, coeff15); p20 = _mm_mullo_epi16(L2, coeff9); p30 = _mm_add_epi16(L3, coeff16); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 5); p01 = _mm_mullo_epi16(H0, coeff7); p11 = _mm_mullo_epi16(H1, coeff15); p21 = _mm_mullo_epi16(H2, coeff9); p31 = _mm_add_epi16(H3, coeff16); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 5); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)dst1, p00); p00 = _mm_mullo_epi16(L0, coeff3); p10 = _mm_mullo_epi16(L1, coeff7); p20 = _mm_mullo_epi16(L2, coeff5); p30 = _mm_add_epi16(L3, coeff8); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 4); p01 = _mm_mullo_epi16(H0, coeff3); p11 = _mm_mullo_epi16(H1, coeff7); p21 = _mm_mullo_epi16(H2, coeff5); p31 = _mm_add_epi16(H3, coeff8); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 4); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)dst2, p00); p00 = _mm_mullo_epi16(L0, coeff5); p10 = _mm_mullo_epi16(L1, coeff13); p20 = _mm_mullo_epi16(L2, coeff11); p30 = _mm_mullo_epi16(L3, coeff3); p00 = _mm_add_epi16(p00, coeff16); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 5); p01 = _mm_mullo_epi16(H0, coeff5); p11 = _mm_mullo_epi16(H1, coeff13); p21 = _mm_mullo_epi16(H2, coeff11); p31 = _mm_mullo_epi16(H3, coeff3); p01 = _mm_add_epi16(p01, coeff16); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); 
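/* finishing the (5*a+13*b+11*c+3*d+16)>>5 accumulation on the high lanes before packing into dst3 */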
p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 5); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)dst3, p00); p00 = _mm_add_epi16(L0, L3); p10 = _mm_add_epi16(L1, L2); p10 = _mm_mullo_epi16(p10, coeff3); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_srli_epi16(p00, 3); p01 = _mm_add_epi16(H0, H3); p11 = _mm_add_epi16(H1, H2); p11 = _mm_mullo_epi16(p11, coeff3); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, coeff4); p01 = _mm_srli_epi16(p01, 3); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)dst4, p00); p00 = _mm_mullo_epi16(L0, coeff3); p10 = _mm_mullo_epi16(L1, coeff11); p20 = _mm_mullo_epi16(L2, coeff13); p30 = _mm_mullo_epi16(L3, coeff5); p00 = _mm_add_epi16(p00, coeff16); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 5); p01 = _mm_mullo_epi16(H0, coeff3); p11 = _mm_mullo_epi16(H1, coeff11); p21 = _mm_mullo_epi16(H2, coeff13); p31 = _mm_mullo_epi16(H3, coeff5); p01 = _mm_add_epi16(p01, coeff16); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 5); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)dst5, p00); p10 = _mm_mullo_epi16(L1, coeff5); p20 = _mm_mullo_epi16(L2, coeff7); p30 = _mm_mullo_epi16(L3, coeff3); p00 = _mm_add_epi16(L0, coeff8); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 4); p11 = _mm_mullo_epi16(H1, coeff5); p21 = _mm_mullo_epi16(H2, coeff7); p31 = _mm_mullo_epi16(H3, coeff3); p01 = _mm_add_epi16(H0, coeff8); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 4); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)dst6, p00); p10 = _mm_mullo_epi16(L1, coeff9); p20 = _mm_mullo_epi16(L2, coeff15); p30 = _mm_mullo_epi16(L3, coeff7); p00 = _mm_add_epi16(L0, coeff16); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 5); p11 = _mm_mullo_epi16(H1, coeff9); p21 = _mm_mullo_epi16(H2, coeff15); p31 = _mm_mullo_epi16(H3, coeff7); p01 = _mm_add_epi16(H0, coeff16); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 5); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)dst7, p00); p10 = _mm_mullo_epi16(L2, coeff2); p00 = _mm_add_epi16(L1, L3); p00 = _mm_add_epi16(p00, coeff2); p00 = _mm_add_epi16(p00, p10); p00 = _mm_srli_epi16(p00, 2); p11 = _mm_mullo_epi16(H2, coeff2); p01 = _mm_add_epi16(H1, H3); p01 = _mm_add_epi16(p01, coeff2); p01 = _mm_add_epi16(p01, p11); p01 = _mm_srli_epi16(p01, 2); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)dst8, p00); src += 16; dst1 += 16; dst2 += 16; dst3 += 16; dst4 += 16; dst5 += 16; dst6 += 16; dst7 += 16; dst8 += 16; S0 = _mm_loadu_si128((__m128i*)(src + 2)); S1 = _mm_loadu_si128((__m128i*)(src + 1)); S2 = _mm_loadu_si128((__m128i*)(src)); S3 = _mm_loadu_si128((__m128i*)(src - 1)); L0 = _mm_unpacklo_epi8(S0, zero); L1 = _mm_unpacklo_epi8(S1, zero); L2 = _mm_unpacklo_epi8(S2, zero); L3 = _mm_unpacklo_epi8(S3, zero); H0 = _mm_unpackhi_epi8(S0, zero); H1 = _mm_unpackhi_epi8(S1, zero); H2 = _mm_unpackhi_epi8(S2, zero); H3 = _mm_unpackhi_epi8(S3, zero); p00 = _mm_mullo_epi16(L0, coeff7); p10 = _mm_mullo_epi16(L1, coeff15); p20 = _mm_mullo_epi16(L2, coeff9); p30 = _mm_add_epi16(L3, coeff16); p00 = _mm_add_epi16(p00, p10); 
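/* second 16-pixel half of the 32-wide row (src and dst1..dst8 were advanced by 16 above); the same eight-filter ladder repeats */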
p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 5); p01 = _mm_mullo_epi16(H0, coeff7); p11 = _mm_mullo_epi16(H1, coeff15); p21 = _mm_mullo_epi16(H2, coeff9); p31 = _mm_add_epi16(H3, coeff16); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 5); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)dst1, p00); p00 = _mm_mullo_epi16(L0, coeff3); p10 = _mm_mullo_epi16(L1, coeff7); p20 = _mm_mullo_epi16(L2, coeff5); p30 = _mm_add_epi16(L3, coeff8); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 4); p01 = _mm_mullo_epi16(H0, coeff3); p11 = _mm_mullo_epi16(H1, coeff7); p21 = _mm_mullo_epi16(H2, coeff5); p31 = _mm_add_epi16(H3, coeff8); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 4); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)dst2, p00); p00 = _mm_mullo_epi16(L0, coeff5); p10 = _mm_mullo_epi16(L1, coeff13); p20 = _mm_mullo_epi16(L2, coeff11); p30 = _mm_mullo_epi16(L3, coeff3); p00 = _mm_add_epi16(p00, coeff16); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 5); p01 = _mm_mullo_epi16(H0, coeff5); p11 = _mm_mullo_epi16(H1, coeff13); p21 = _mm_mullo_epi16(H2, coeff11); p31 = _mm_mullo_epi16(H3, coeff3); p01 = _mm_add_epi16(p01, coeff16); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 5); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)dst3, p00); p00 = _mm_add_epi16(L0, L3); p10 = _mm_add_epi16(L1, L2); p10 = _mm_mullo_epi16(p10, coeff3); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_srli_epi16(p00, 3); p01 = _mm_add_epi16(H0, H3); p11 = _mm_add_epi16(H1, H2); p11 = _mm_mullo_epi16(p11, coeff3); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, coeff4); p01 = _mm_srli_epi16(p01, 3); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)dst4, p00); p00 = _mm_mullo_epi16(L0, coeff3); p10 = _mm_mullo_epi16(L1, coeff11); p20 = _mm_mullo_epi16(L2, coeff13); p30 = _mm_mullo_epi16(L3, coeff5); p00 = _mm_add_epi16(p00, coeff16); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 5); p01 = _mm_mullo_epi16(H0, coeff3); p11 = _mm_mullo_epi16(H1, coeff11); p21 = _mm_mullo_epi16(H2, coeff13); p31 = _mm_mullo_epi16(H3, coeff5); p01 = _mm_add_epi16(p01, coeff16); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 5); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)dst5, p00); p10 = _mm_mullo_epi16(L1, coeff5); p20 = _mm_mullo_epi16(L2, coeff7); p30 = _mm_mullo_epi16(L3, coeff3); p00 = _mm_add_epi16(L0, coeff8); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 4); p11 = _mm_mullo_epi16(H1, coeff5); p21 = _mm_mullo_epi16(H2, coeff7); p31 = _mm_mullo_epi16(H3, coeff3); p01 = _mm_add_epi16(H0, coeff8); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 4); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)dst6, p00); p10 = _mm_mullo_epi16(L1, coeff9); p20 = _mm_mullo_epi16(L2, coeff15); p30 = _mm_mullo_epi16(L3, coeff7); p00 = _mm_add_epi16(L0, coeff16); p00 = 
_mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 5); p11 = _mm_mullo_epi16(H1, coeff9); p21 = _mm_mullo_epi16(H2, coeff15); p31 = _mm_mullo_epi16(H3, coeff7); p01 = _mm_add_epi16(H0, coeff16); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 5); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)dst7, p00); p10 = _mm_mullo_epi16(L2, coeff2); p00 = _mm_add_epi16(L1, L3); p00 = _mm_add_epi16(p00, coeff2); p00 = _mm_add_epi16(p00, p10); p00 = _mm_srli_epi16(p00, 2); p11 = _mm_mullo_epi16(H2, coeff2); p01 = _mm_add_epi16(H1, H3); p01 = _mm_add_epi16(p01, coeff2); p01 = _mm_add_epi16(p01, p11); p01 = _mm_srli_epi16(p01, 2); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)dst8, p00); } else { __m128i p00, p10, p20, p30; __m128i S0 = _mm_loadu_si128((__m128i*)(src + 2)); __m128i S1 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src)); __m128i S3 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i L3 = _mm_unpacklo_epi8(S3, zero); p00 = _mm_mullo_epi16(L0, coeff7); p10 = _mm_mullo_epi16(L1, coeff15); p20 = _mm_mullo_epi16(L2, coeff9); p30 = _mm_add_epi16(L3, coeff16); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)dst1, p00); p00 = _mm_mullo_epi16(L0, coeff3); p10 = _mm_mullo_epi16(L1, coeff7); p20 = _mm_mullo_epi16(L2, coeff5); p30 = _mm_add_epi16(L3, coeff8); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 4); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)dst2, p00); p00 = _mm_mullo_epi16(L0, coeff5); p10 = _mm_mullo_epi16(L1, coeff13); p20 = _mm_mullo_epi16(L2, coeff11); p30 = _mm_mullo_epi16(L3, coeff3); p00 = _mm_add_epi16(p00, coeff16); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)dst3, p00); p00 = _mm_add_epi16(L0, L3); p10 = _mm_add_epi16(L1, L2); p10 = _mm_mullo_epi16(p10, coeff3); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_srli_epi16(p00, 3); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)dst4, p00); p00 = _mm_mullo_epi16(L0, coeff3); p10 = _mm_mullo_epi16(L1, coeff11); p20 = _mm_mullo_epi16(L2, coeff13); p30 = _mm_mullo_epi16(L3, coeff5); p00 = _mm_add_epi16(p00, coeff16); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)dst5, p00); p10 = _mm_mullo_epi16(L1, coeff5); p20 = _mm_mullo_epi16(L2, coeff7); p30 = _mm_mullo_epi16(L3, coeff3); p00 = _mm_add_epi16(L0, coeff8); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 4); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)dst6, p00); p10 = _mm_mullo_epi16(L1, coeff9); p20 = _mm_mullo_epi16(L2, coeff15); p30 = _mm_mullo_epi16(L3, coeff7); p00 = _mm_add_epi16(L0, coeff16); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, 
p00); _mm_storel_epi64((__m128i*)dst7, p00); p10 = _mm_mullo_epi16(L2, coeff2); p00 = _mm_add_epi16(L1, L3); p00 = _mm_add_epi16(p00, coeff2); p00 = _mm_add_epi16(p00, p10); p00 = _mm_srli_epi16(p00, 2); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)dst8, p00); } } else { pel_t *dst1 = dst; pel_t *dst2 = dst1 + i_dst; pel_t *dst3 = dst2 + i_dst; pel_t *dst4 = dst3 + i_dst; if (bsx == 16) { __m128i p00, p10, p20, p30; __m128i p01, p11, p21, p31; __m128i S0 = _mm_loadu_si128((__m128i*)(src + 2)); __m128i S1 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src)); __m128i S3 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i L3 = _mm_unpacklo_epi8(S3, zero); __m128i H0 = _mm_unpackhi_epi8(S0, zero); __m128i H1 = _mm_unpackhi_epi8(S1, zero); __m128i H2 = _mm_unpackhi_epi8(S2, zero); __m128i H3 = _mm_unpackhi_epi8(S3, zero); p00 = _mm_mullo_epi16(L0, coeff7); p10 = _mm_mullo_epi16(L1, coeff15); p20 = _mm_mullo_epi16(L2, coeff9); p30 = _mm_add_epi16(L3, coeff16); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 5); p01 = _mm_mullo_epi16(H0, coeff7); p11 = _mm_mullo_epi16(H1, coeff15); p21 = _mm_mullo_epi16(H2, coeff9); p31 = _mm_add_epi16(H3, coeff16); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 5); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)dst1, p00); p00 = _mm_mullo_epi16(L0, coeff3); p10 = _mm_mullo_epi16(L1, coeff7); p20 = _mm_mullo_epi16(L2, coeff5); p30 = _mm_add_epi16(L3, coeff8); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 4); p01 = _mm_mullo_epi16(H0, coeff3); p11 = _mm_mullo_epi16(H1, coeff7); p21 = _mm_mullo_epi16(H2, coeff5); p31 = _mm_add_epi16(H3, coeff8); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 4); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)dst2, p00); p00 = _mm_mullo_epi16(L0, coeff5); p10 = _mm_mullo_epi16(L1, coeff13); p20 = _mm_mullo_epi16(L2, coeff11); p30 = _mm_mullo_epi16(L3, coeff3); p00 = _mm_add_epi16(p00, coeff16); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 5); p01 = _mm_mullo_epi16(H0, coeff5); p11 = _mm_mullo_epi16(H1, coeff13); p21 = _mm_mullo_epi16(H2, coeff11); p31 = _mm_mullo_epi16(H3, coeff3); p01 = _mm_add_epi16(p01, coeff16); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 5); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)dst3, p00); p00 = _mm_add_epi16(L0, L3); p10 = _mm_add_epi16(L1, L2); p10 = _mm_mullo_epi16(p10, coeff3); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_srli_epi16(p00, 3); p01 = _mm_add_epi16(H0, H3); p11 = _mm_add_epi16(H1, H2); p11 = _mm_mullo_epi16(p11, coeff3); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, coeff4); p01 = _mm_srli_epi16(p01, 3); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)dst4, p00); } else { __m128i p00, p10, p20, p30; __m128i S0 = _mm_loadu_si128((__m128i*)(src + 2)); __m128i S1 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src)); __m128i S3 = _mm_loadu_si128((__m128i*)(src - 1)); 
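/* remaining narrow case (bsx < 16): only the low unpacklo lanes are needed and each row is written with a single 32-bit store */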
__m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i L3 = _mm_unpacklo_epi8(S3, zero); p00 = _mm_mullo_epi16(L0, coeff7); p10 = _mm_mullo_epi16(L1, coeff15); p20 = _mm_mullo_epi16(L2, coeff9); p30 = _mm_add_epi16(L3, coeff16); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, p00); ((int*)(dst1))[0] = _mm_cvtsi128_si32(p00); p00 = _mm_mullo_epi16(L0, coeff3); p10 = _mm_mullo_epi16(L1, coeff7); p20 = _mm_mullo_epi16(L2, coeff5); p30 = _mm_add_epi16(L3, coeff8); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 4); p00 = _mm_packus_epi16(p00, p00); ((int*)(dst2))[0] = _mm_cvtsi128_si32(p00); p00 = _mm_mullo_epi16(L0, coeff5); p10 = _mm_mullo_epi16(L1, coeff13); p20 = _mm_mullo_epi16(L2, coeff11); p30 = _mm_mullo_epi16(L3, coeff3); p00 = _mm_add_epi16(p00, coeff16); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, p00); ((int*)(dst3))[0] = _mm_cvtsi128_si32(p00); p00 = _mm_add_epi16(L0, L3); p10 = _mm_add_epi16(L1, L2); p10 = _mm_mullo_epi16(p10, coeff3); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_srli_epi16(p00, 3); p00 = _mm_packus_epi16(p00, p00); ((int*)(dst4))[0] = _mm_cvtsi128_si32(p00); } } } /* --------------------------------------------------------------------------- */ void intra_pred_ang_xy_14_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i; __m128i coeff2 = _mm_set1_epi16(2); __m128i coeff3 = _mm_set1_epi16(3); __m128i coeff4 = _mm_set1_epi16(4); __m128i coeff5 = _mm_set1_epi16(5); __m128i coeff7 = _mm_set1_epi16(7); __m128i coeff8 = _mm_set1_epi16(8); __m128i zero = _mm_setzero_si128(); UNUSED_PARAMETER(dir_mode); if (bsy != 4) { ALIGN16(pel_t first_line[4 * (64 + 32)]); int line_size = bsx + bsy / 4 - 1; int left_size = line_size - bsx; int aligned_line_size = ((line_size + 31) >> 4) << 4; pel_t *pfirst[4]; __m128i shuffle1 = _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15); __m128i shuffle2 = _mm_setr_epi8(1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, 0, 4, 8, 12); __m128i shuffle3 = _mm_setr_epi8(2, 6, 10, 14, 3, 7, 11, 15, 0, 4, 8, 12, 1, 5, 9, 13); __m128i shuffle4 = _mm_setr_epi8(3, 7, 11, 15, 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14); pel_t *pSrc1 = src; pfirst[0] = first_line; pfirst[1] = first_line + aligned_line_size; pfirst[2] = first_line + aligned_line_size * 2; pfirst[3] = first_line + aligned_line_size * 3; src -= bsy - 4; for (i = 0; i < left_size - 1; i += 4, src += 16) { __m128i p00, p01, p10, p11; __m128i p20, p30; __m128i S0 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i S1 = _mm_loadu_si128((__m128i*)(src)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i H0 = _mm_unpackhi_epi8(S0, zero); __m128i H1 = _mm_unpackhi_epi8(S1, zero); __m128i H2 = _mm_unpackhi_epi8(S2, zero); p00 = _mm_add_epi16(L0, L1); p01 = _mm_add_epi16(L1, L2); p10 = _mm_add_epi16(H0, H1); p11 = _mm_add_epi16(H1, H2); p00 = _mm_add_epi16(p00, coeff2); p10 = _mm_add_epi16(p10, coeff2); p00 = _mm_add_epi16(p00, p01); p10 = _mm_add_epi16(p10, p11); p00 = _mm_srli_epi16(p00, 2); p10 = _mm_srli_epi16(p10, 2); p00 = _mm_packus_epi16(p00, 
p10); p10 = _mm_shuffle_epi8(p00, shuffle2); p20 = _mm_shuffle_epi8(p00, shuffle3); p30 = _mm_shuffle_epi8(p00, shuffle4); p00 = _mm_shuffle_epi8(p00, shuffle1); ((int*)&pfirst[0][i])[0] = _mm_cvtsi128_si32(p30); ((int*)&pfirst[1][i])[0] = _mm_cvtsi128_si32(p20); ((int*)&pfirst[2][i])[0] = _mm_cvtsi128_si32(p10); ((int*)&pfirst[3][i])[0] = _mm_cvtsi128_si32(p00); } if (i < left_size) { /* process the remaining left-edge samples (low 8 lanes only) */ __m128i p00, p01, p10; __m128i p20, p30; __m128i S0 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i S1 = _mm_loadu_si128((__m128i*)(src)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); p00 = _mm_add_epi16(L0, L1); p01 = _mm_add_epi16(L1, L2); p00 = _mm_add_epi16(p00, coeff2); p00 = _mm_add_epi16(p00, p01); p00 = _mm_srli_epi16(p00, 2); p00 = _mm_packus_epi16(p00, p00); p10 = _mm_shuffle_epi8(p00, shuffle2); p20 = _mm_shuffle_epi8(p00, shuffle3); p30 = _mm_shuffle_epi8(p00, shuffle4); p00 = _mm_shuffle_epi8(p00, shuffle1); ((int*)&pfirst[0][i])[0] = _mm_cvtsi128_si32(p30); ((int*)&pfirst[1][i])[0] = _mm_cvtsi128_si32(p20); ((int*)&pfirst[2][i])[0] = _mm_cvtsi128_si32(p10); ((int*)&pfirst[3][i])[0] = _mm_cvtsi128_si32(p00); } src = pSrc1; for (i = left_size; i < line_size; i++, src++) { __m128i p00, p10, p20, p30; __m128i p01, p11, p21, p31; __m128i S0 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S3 = _mm_loadu_si128((__m128i*)(src + 2)); __m128i S1 = _mm_loadu_si128((__m128i*)(src)); __m128i S2 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i L3 = _mm_unpacklo_epi8(S3, zero); __m128i H0 = _mm_unpackhi_epi8(S0, zero); __m128i H1 = _mm_unpackhi_epi8(S1, zero); __m128i H2 = _mm_unpackhi_epi8(S2, zero); __m128i H3 = _mm_unpackhi_epi8(S3, zero); p00 = _mm_mullo_epi16(L0, coeff3); p10 = _mm_mullo_epi16(L1, coeff7); p20 = _mm_mullo_epi16(L2, coeff5); p30 = _mm_add_epi16(L3, coeff8); p00 = _mm_add_epi16(p00, p30); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_srli_epi16(p00, 4); p01 = _mm_mullo_epi16(H0, coeff3); p11 = _mm_mullo_epi16(H1, coeff7); p21 = _mm_mullo_epi16(H2, coeff5); p31 = _mm_add_epi16(H3, coeff8); p01 = _mm_add_epi16(p01, p31); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_srli_epi16(p01, 4); p00 = _mm_packus_epi16(p00, p01); _mm_storeu_si128((__m128i*)&pfirst[2][i], p00); p00 = _mm_add_epi16(L1, L2); p00 = _mm_mullo_epi16(p00, coeff3); p10 = _mm_add_epi16(L0, L3); p10 = _mm_add_epi16(p10, coeff4); p00 = _mm_add_epi16(p10, p00); p00 = _mm_srli_epi16(p00, 3); p01 = _mm_add_epi16(H1, H2); p01 = _mm_mullo_epi16(p01, coeff3); p11 = _mm_add_epi16(H0, H3); p11 = _mm_add_epi16(p11, coeff4); p01 = _mm_add_epi16(p11, p01); p01 = _mm_srli_epi16(p01, 3); p00 = _mm_packus_epi16(p00, p01); _mm_storeu_si128((__m128i*)&pfirst[1][i], p00); p10 = _mm_mullo_epi16(L1, coeff5); p20 = _mm_mullo_epi16(L2, coeff7); p30 = _mm_mullo_epi16(L3, coeff3); p00 = _mm_add_epi16(L0, coeff8); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 4); p11 = _mm_mullo_epi16(H1, coeff5); p21 = _mm_mullo_epi16(H2, coeff7); p31 = _mm_mullo_epi16(H3, coeff3); p01 = _mm_add_epi16(H0, coeff8); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 4); p00 = _mm_packus_epi16(p00, p01); 
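/* (a+5*b+7*c+3*d+8)>>4 (a..d = consecutive reference pixels) goes to pfirst[0]; the last pass below, (a+2*b+c+2)>>2, fills pfirst[3] */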
_mm_storeu_si128((__m128i*)&pfirst[0][i], p00); p00 = _mm_add_epi16(L0, L1); p10 = _mm_add_epi16(L1, L2); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, coeff2); p00 = _mm_srli_epi16(p00, 2); p01 = _mm_add_epi16(H0, H1); p11 = _mm_add_epi16(H1, H2); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, coeff2); p01 = _mm_srli_epi16(p01, 2); p00 = _mm_packus_epi16(p00, p01); _mm_storeu_si128((__m128i*)&pfirst[3][i], p00); } pfirst[0] += left_size; pfirst[1] += left_size; pfirst[2] += left_size; pfirst[3] += left_size; bsy >>= 2; for (i = 0; i < bsy; i++) { memcpy(dst, pfirst[0] - i, bsx * sizeof(pel_t)); dst += i_dst; memcpy(dst, pfirst[1] - i, bsx * sizeof(pel_t)); dst += i_dst; memcpy(dst, pfirst[2] - i, bsx * sizeof(pel_t)); dst += i_dst; memcpy(dst, pfirst[3] - i, bsx * sizeof(pel_t)); dst += i_dst; } } else { if (bsx == 16) { pel_t *dst2 = dst + i_dst; pel_t *dst3 = dst2 + i_dst; pel_t *dst4 = dst3 + i_dst; __m128i p00, p10, p20, p30; __m128i p01, p11, p21, p31; __m128i S0 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S3 = _mm_loadu_si128((__m128i*)(src + 2)); __m128i S1 = _mm_loadu_si128((__m128i*)(src)); __m128i S2 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i L3 = _mm_unpacklo_epi8(S3, zero); __m128i H0 = _mm_unpackhi_epi8(S0, zero); __m128i H1 = _mm_unpackhi_epi8(S1, zero); __m128i H2 = _mm_unpackhi_epi8(S2, zero); __m128i H3 = _mm_unpackhi_epi8(S3, zero); p00 = _mm_mullo_epi16(L0, coeff3); p10 = _mm_mullo_epi16(L1, coeff7); p20 = _mm_mullo_epi16(L2, coeff5); p30 = _mm_add_epi16(L3, coeff8); p00 = _mm_add_epi16(p00, p30); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_srli_epi16(p00, 4); p01 = _mm_mullo_epi16(H0, coeff3); p11 = _mm_mullo_epi16(H1, coeff7); p21 = _mm_mullo_epi16(H2, coeff5); p31 = _mm_add_epi16(H3, coeff8); p01 = _mm_add_epi16(p01, p31); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_srli_epi16(p01, 4); p00 = _mm_packus_epi16(p00, p01); _mm_storeu_si128((__m128i*)dst3, p00); p00 = _mm_add_epi16(L1, L2); p00 = _mm_mullo_epi16(p00, coeff3); p10 = _mm_add_epi16(L0, L3); p10 = _mm_add_epi16(p10, coeff4); p00 = _mm_add_epi16(p10, p00); p00 = _mm_srli_epi16(p00, 3); p01 = _mm_add_epi16(H1, H2); p01 = _mm_mullo_epi16(p01, coeff3); p11 = _mm_add_epi16(H0, H3); p11 = _mm_add_epi16(p11, coeff4); p01 = _mm_add_epi16(p11, p01); p01 = _mm_srli_epi16(p01, 3); p00 = _mm_packus_epi16(p00, p01); _mm_storeu_si128((__m128i*)dst2, p00); p10 = _mm_mullo_epi16(L1, coeff5); p20 = _mm_mullo_epi16(L2, coeff7); p30 = _mm_mullo_epi16(L3, coeff3); p00 = _mm_add_epi16(L0, coeff8); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 4); p11 = _mm_mullo_epi16(H1, coeff5); p21 = _mm_mullo_epi16(H2, coeff7); p31 = _mm_mullo_epi16(H3, coeff3); p01 = _mm_add_epi16(H0, coeff8); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); p01 = _mm_srli_epi16(p01, 4); p00 = _mm_packus_epi16(p00, p01); _mm_storeu_si128((__m128i*)dst, p00); p00 = _mm_add_epi16(L0, L1); p10 = _mm_add_epi16(L1, L2); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, coeff2); p00 = _mm_srli_epi16(p00, 2); p01 = _mm_add_epi16(H0, H1); p11 = _mm_add_epi16(H1, H2); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, coeff2); p01 = _mm_srli_epi16(p01, 2); p00 = _mm_packus_epi16(p00, p01); _mm_storeu_si128((__m128i*)dst4, p00); } else 
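/* bsy == 4 and bsx == 4: the same four filters as in the 16-wide branch
 * above, evaluated on the low 8 lanes only, with each 4-pixel row emitted
 * as a single 32-bit store through _mm_cvtsi128_si32(). */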
{ pel_t *dst2 = dst + i_dst; pel_t *dst3 = dst2 + i_dst; pel_t *dst4 = dst3 + i_dst; __m128i p00, p10, p20, p30; __m128i S0 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S3 = _mm_loadu_si128((__m128i*)(src + 2)); __m128i S1 = _mm_loadu_si128((__m128i*)(src)); __m128i S2 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i L3 = _mm_unpacklo_epi8(S3, zero); p00 = _mm_mullo_epi16(L0, coeff3); p10 = _mm_mullo_epi16(L1, coeff7); p20 = _mm_mullo_epi16(L2, coeff5); p30 = _mm_add_epi16(L3, coeff8); p00 = _mm_add_epi16(p00, p30); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_srli_epi16(p00, 4); p00 = _mm_packus_epi16(p00, p00); ((int*)dst3)[0] = _mm_cvtsi128_si32(p00); p00 = _mm_add_epi16(L1, L2); p00 = _mm_mullo_epi16(p00, coeff3); p10 = _mm_add_epi16(L0, L3); p10 = _mm_add_epi16(p10, coeff4); p00 = _mm_add_epi16(p10, p00); p00 = _mm_srli_epi16(p00, 3); p00 = _mm_packus_epi16(p00, p00); ((int*)dst2)[0] = _mm_cvtsi128_si32(p00); p10 = _mm_mullo_epi16(L1, coeff5); p20 = _mm_mullo_epi16(L2, coeff7); p30 = _mm_mullo_epi16(L3, coeff3); p00 = _mm_add_epi16(L0, coeff8); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 4); p00 = _mm_packus_epi16(p00, p00); ((int*)dst)[0] = _mm_cvtsi128_si32(p00); p00 = _mm_add_epi16(L0, L1); p10 = _mm_add_epi16(L1, L2); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, coeff2); p00 = _mm_srli_epi16(p00, 2); p00 = _mm_packus_epi16(p00, p00); ((int*)dst4)[0] = _mm_cvtsi128_si32(p00); } } } /* --------------------------------------------------------------------------- */ void intra_pred_ang_xy_16_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { ALIGN16(pel_t first_line[2 * (64 + 48)]); int line_size = bsx + bsy / 2 - 1; int left_size = line_size - bsx; int aligned_line_size = ((line_size + 31) >> 4) << 4; pel_t *pfirst[2]; __m128i zero = _mm_setzero_si128(); __m128i coeff2 = _mm_set1_epi16(2); __m128i coeff3 = _mm_set1_epi16(3); __m128i coeff4 = _mm_set1_epi16(4); __m128i shuffle1 = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); __m128i shuffle2 = _mm_setr_epi8(1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14); int i; pel_t *pSrc1; UNUSED_PARAMETER(dir_mode); pfirst[0] = first_line; pfirst[1] = first_line + aligned_line_size; src -= bsy - 2; pSrc1 = src; for (i = 0; i < left_size - 4; i += 8, src += 16) { __m128i p00, p01, p10, p11; __m128i S0 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i S1 = _mm_loadu_si128((__m128i*)(src)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i H0 = _mm_unpackhi_epi8(S0, zero); __m128i H1 = _mm_unpackhi_epi8(S1, zero); __m128i H2 = _mm_unpackhi_epi8(S2, zero); p00 = _mm_add_epi16(L0, L1); p01 = _mm_add_epi16(L1, L2); p10 = _mm_add_epi16(H0, H1); p11 = _mm_add_epi16(H1, H2); p00 = _mm_add_epi16(p00, coeff2); p10 = _mm_add_epi16(p10, coeff2); p00 = _mm_add_epi16(p00, p01); p10 = _mm_add_epi16(p10, p11); p00 = _mm_srli_epi16(p00, 2); p10 = _mm_srli_epi16(p10, 2); p00 = _mm_packus_epi16(p00, p10); p10 = _mm_shuffle_epi8(p00, shuffle2); p00 = _mm_shuffle_epi8(p00, shuffle1); _mm_storel_epi64((__m128i*)&pfirst[1][i], p00); _mm_storel_epi64((__m128i*)&pfirst[0][i], p10); } if (i < left_size) { __m128i p00, p01; __m128i S0 = 
_mm_loadu_si128((__m128i*)(src - 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i S1 = _mm_loadu_si128((__m128i*)(src)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); p00 = _mm_add_epi16(L0, L1); p01 = _mm_add_epi16(L1, L2); p00 = _mm_add_epi16(p00, coeff2); p00 = _mm_add_epi16(p00, p01); p00 = _mm_srli_epi16(p00, 2); p00 = _mm_packus_epi16(p00, p00); p01 = _mm_shuffle_epi8(p00, shuffle2); p00 = _mm_shuffle_epi8(p00, shuffle1); ((int*)&pfirst[1][i])[0] = _mm_cvtsi128_si32(p00); ((int*)&pfirst[0][i])[0] = _mm_cvtsi128_si32(p01); } src = pSrc1 + left_size + left_size; for (i = left_size; i < line_size; i += 16, src += 16) { __m128i p00, p01, p10, p11; __m128i S0 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S3 = _mm_loadu_si128((__m128i*)(src + 2)); __m128i S1 = _mm_loadu_si128((__m128i*)(src)); __m128i S2 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i L3 = _mm_unpacklo_epi8(S3, zero); __m128i H0 = _mm_unpackhi_epi8(S0, zero); __m128i H1 = _mm_unpackhi_epi8(S1, zero); __m128i H2 = _mm_unpackhi_epi8(S2, zero); __m128i H3 = _mm_unpackhi_epi8(S3, zero); p00 = _mm_add_epi16(L1, L2); p10 = _mm_add_epi16(H1, H2); p00 = _mm_mullo_epi16(p00, coeff3); p10 = _mm_mullo_epi16(p10, coeff3); p01 = _mm_add_epi16(L0, L3); p11 = _mm_add_epi16(H0, H3); p00 = _mm_add_epi16(p00, coeff4); p10 = _mm_add_epi16(p10, coeff4); p00 = _mm_add_epi16(p00, p01); p10 = _mm_add_epi16(p10, p11); p00 = _mm_srli_epi16(p00, 3); p10 = _mm_srli_epi16(p10, 3); p00 = _mm_packus_epi16(p00, p10); _mm_storeu_si128((__m128i*)&pfirst[0][i], p00); p00 = _mm_add_epi16(L0, L1); p01 = _mm_add_epi16(L1, L2); p10 = _mm_add_epi16(H0, H1); p11 = _mm_add_epi16(H1, H2); p00 = _mm_add_epi16(p00, coeff2); p10 = _mm_add_epi16(p10, coeff2); p00 = _mm_add_epi16(p00, p01); p10 = _mm_add_epi16(p10, p11); p00 = _mm_srli_epi16(p00, 2); p10 = _mm_srli_epi16(p10, 2); p00 = _mm_packus_epi16(p00, p10); _mm_storeu_si128((__m128i*)&pfirst[1][i], p00); } pfirst[0] += left_size; pfirst[1] += left_size; bsy >>= 1; switch (bsx) { case 4: for (i = 0; i < bsy; i++) { CP32(dst, pfirst[0] - i); CP32(dst + i_dst, pfirst[1] - i); dst += (i_dst << 1); } break; case 8: for (i = 0; i < bsy; i++) { CP64(dst, pfirst[0] - i); CP64(dst + i_dst, pfirst[1] - i); dst += (i_dst << 1); } break; default: for (i = 0; i < bsy; i++) { memcpy(dst, pfirst[0] - i, bsx * sizeof(pel_t)); memcpy(dst + i_dst, pfirst[1] - i, bsx * sizeof(pel_t)); dst += (i_dst << 1); } break; } } /* --------------------------------------------------------------------------- */ void intra_pred_ang_xy_18_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { ALIGN16(pel_t first_line[64 + 64]); int line_size = bsx + bsy - 1; int i; pel_t *pfirst = first_line + bsy - 1; __m128i coeff2 = _mm_set1_epi16(2); __m128i zero = _mm_setzero_si128(); UNUSED_PARAMETER(dir_mode); src -= bsy - 1; for (i = 0; i < line_size - 8; i += 16, src += 16) { __m128i S0 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i S1 = _mm_loadu_si128((__m128i*)(src)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i H0 = _mm_unpackhi_epi8(S0, zero); __m128i H1 = _mm_unpackhi_epi8(S1, zero); __m128i H2 = _mm_unpackhi_epi8(S2, zero); __m128i sum1 = _mm_add_epi16(L0, L1); __m128i 
sum2 = _mm_add_epi16(L1, L2); __m128i sum3 = _mm_add_epi16(H0, H1); __m128i sum4 = _mm_add_epi16(H1, H2); sum1 = _mm_add_epi16(sum1, sum2); sum3 = _mm_add_epi16(sum3, sum4); sum1 = _mm_add_epi16(sum1, coeff2); sum3 = _mm_add_epi16(sum3, coeff2); sum1 = _mm_srli_epi16(sum1, 2); sum3 = _mm_srli_epi16(sum3, 2); sum1 = _mm_packus_epi16(sum1, sum3); _mm_store_si128((__m128i*)&first_line[i], sum1); } if (i < line_size) { __m128i S0 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i S1 = _mm_loadu_si128((__m128i*)(src)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i sum1 = _mm_add_epi16(L0, L1); __m128i sum2 = _mm_add_epi16(L1, L2); sum1 = _mm_add_epi16(sum1, sum2); sum1 = _mm_add_epi16(sum1, coeff2); sum1 = _mm_srli_epi16(sum1, 2); sum1 = _mm_packus_epi16(sum1, sum1); _mm_storel_epi64((__m128i*)&first_line[i], sum1); } switch (bsx) { case 4: for (i = 0; i < bsy; i++) { CP32(dst, pfirst--); dst += i_dst; } break; case 8: for (i = 0; i < bsy; i++) { CP64(dst, pfirst--); dst += i_dst; } break; default: for (i = 0; i < bsy; i++) { memcpy(dst, pfirst--, bsx * sizeof(pel_t)); dst += i_dst; } break; } } /* --------------------------------------------------------------------------- */ void intra_pred_ang_xy_20_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { ALIGN16(pel_t first_line[64 + 128]); int left_size = (bsy - 1) * 2 + 1; int top_size = bsx - 1; int line_size = left_size + top_size; int i; pel_t *pfirst = first_line + left_size - 1; __m128i zero = _mm_setzero_si128(); __m128i coeff2 = _mm_set1_epi16(2); __m128i coeff3 = _mm_set1_epi16(3); __m128i coeff4 = _mm_set1_epi16(4); __m128i shuffle = _mm_setr_epi8(0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15); pel_t *pSrc1 = src; UNUSED_PARAMETER(dir_mode); src -= bsy; for (i = 0; i < left_size - 16; i += 32, src += 16) { __m128i p00, p01, p10, p11; __m128i p20, p21, p30, p31; __m128i S0 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S3 = _mm_loadu_si128((__m128i*)(src + 2)); __m128i S2 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i S1 = _mm_loadu_si128((__m128i*)(src)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i L3 = _mm_unpacklo_epi8(S3, zero); __m128i H0 = _mm_unpackhi_epi8(S0, zero); __m128i H1 = _mm_unpackhi_epi8(S1, zero); __m128i H2 = _mm_unpackhi_epi8(S2, zero); __m128i H3 = _mm_unpackhi_epi8(S3, zero); p00 = _mm_add_epi16(L1, L2); p10 = _mm_add_epi16(H1, H2); p00 = _mm_mullo_epi16(p00, coeff3); p10 = _mm_mullo_epi16(p10, coeff3); p01 = _mm_add_epi16(L0, L3); p11 = _mm_add_epi16(H0, H3); p00 = _mm_add_epi16(p00, coeff4); p10 = _mm_add_epi16(p10, coeff4); p00 = _mm_add_epi16(p00, p01); p10 = _mm_add_epi16(p10, p11); p00 = _mm_srli_epi16(p00, 3); p10 = _mm_srli_epi16(p10, 3); p20 = _mm_add_epi16(L1, L2); p30 = _mm_add_epi16(H1, H2); p21 = _mm_add_epi16(L2, L3); p31 = _mm_add_epi16(H2, H3); p20 = _mm_add_epi16(p20, coeff2); p30 = _mm_add_epi16(p30, coeff2); p20 = _mm_add_epi16(p20, p21); p30 = _mm_add_epi16(p30, p31); p20 = _mm_srli_epi16(p20, 2); p30 = _mm_srli_epi16(p30, 2); p00 = _mm_packus_epi16(p00, p20); p10 = _mm_packus_epi16(p10, p30); p00 = _mm_shuffle_epi8(p00, shuffle); p10 = _mm_shuffle_epi8(p10, shuffle); _mm_store_si128((__m128i*)&first_line[i], p00); _mm_store_si128((__m128i*)&first_line[i + 16], p10); } if (i < left_size) { __m128i p00, p01; __m128i p20, p21; __m128i
S0 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S3 = _mm_loadu_si128((__m128i*)(src + 2)); __m128i S2 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i S1 = _mm_loadu_si128((__m128i*)(src)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i L3 = _mm_unpacklo_epi8(S3, zero); p00 = _mm_add_epi16(L1, L2); p00 = _mm_mullo_epi16(p00, coeff3); p01 = _mm_add_epi16(L0, L3); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_add_epi16(p00, p01); p00 = _mm_srli_epi16(p00, 3); p20 = _mm_add_epi16(L1, L2); p21 = _mm_add_epi16(L2, L3); p20 = _mm_add_epi16(p20, coeff2); p20 = _mm_add_epi16(p20, p21); p20 = _mm_srli_epi16(p20, 2); p00 = _mm_packus_epi16(p00, p20); p00 = _mm_shuffle_epi8(p00, shuffle); _mm_store_si128((__m128i*)&first_line[i], p00); } src = pSrc1; for (i = left_size; i < line_size - 8; i += 16, src += 16) { __m128i S0 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i S1 = _mm_loadu_si128((__m128i*)(src)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i H0 = _mm_unpackhi_epi8(S0, zero); __m128i H1 = _mm_unpackhi_epi8(S1, zero); __m128i H2 = _mm_unpackhi_epi8(S2, zero); __m128i sum1 = _mm_add_epi16(L0, L1); __m128i sum2 = _mm_add_epi16(L1, L2); __m128i sum3 = _mm_add_epi16(H0, H1); __m128i sum4 = _mm_add_epi16(H1, H2); sum1 = _mm_add_epi16(sum1, sum2); sum3 = _mm_add_epi16(sum3, sum4); sum1 = _mm_add_epi16(sum1, coeff2); sum3 = _mm_add_epi16(sum3, coeff2); sum1 = _mm_srli_epi16(sum1, 2); sum3 = _mm_srli_epi16(sum3, 2); sum1 = _mm_packus_epi16(sum1, sum3); _mm_storeu_si128((__m128i*)&first_line[i], sum1); } if (i < line_size) { __m128i S0 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i S1 = _mm_loadu_si128((__m128i*)(src)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i sum1 = _mm_add_epi16(L0, L1); __m128i sum2 = _mm_add_epi16(L1, L2); sum1 = _mm_add_epi16(sum1, sum2); sum1 = _mm_add_epi16(sum1, coeff2); sum1 = _mm_srli_epi16(sum1, 2); sum1 = _mm_packus_epi16(sum1, sum1); _mm_storel_epi64((__m128i*)&first_line[i], sum1); } for (i = 0; i < bsy; i++) { memcpy(dst, pfirst, bsx * sizeof(pel_t)); pfirst -= 2; dst += i_dst; } } /* --------------------------------------------------------------------------- */ void intra_pred_ang_xy_22_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i; UNUSED_PARAMETER(dir_mode); src -= bsy; if (bsx != 4) { ALIGN16(pel_t first_line[64 + 256]); int left_size = (bsy - 1) * 4 + 3; int top_size = bsx - 3; int line_size = left_size + top_size; pel_t *pfirst = first_line + left_size - 3; pel_t *pSrc1 = src; __m128i zero = _mm_setzero_si128(); __m128i coeff2 = _mm_set1_epi16(2); __m128i coeff3 = _mm_set1_epi16(3); __m128i coeff4 = _mm_set1_epi16(4); __m128i coeff5 = _mm_set1_epi16(5); __m128i coeff7 = _mm_set1_epi16(7); __m128i coeff8 = _mm_set1_epi16(8); __m128i shuffle = _mm_setr_epi8(0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15); for (i = 0; i < line_size - 32; i += 64, src += 16) { __m128i p00, p10, p20, p30; __m128i p01, p11, p21, p31; __m128i M1, M2, M3, M4, M5, M6, M7, M8; __m128i S0 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S3 = _mm_loadu_si128((__m128i*)(src + 2)); __m128i S1 = _mm_loadu_si128((__m128i*)(src)); __m128i S2 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i L0 = 
_mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i L3 = _mm_unpacklo_epi8(S3, zero); __m128i H0 = _mm_unpackhi_epi8(S0, zero); __m128i H1 = _mm_unpackhi_epi8(S1, zero); __m128i H2 = _mm_unpackhi_epi8(S2, zero); __m128i H3 = _mm_unpackhi_epi8(S3, zero); p00 = _mm_mullo_epi16(L0, coeff3); p10 = _mm_mullo_epi16(L1, coeff7); p20 = _mm_mullo_epi16(L2, coeff5); p30 = _mm_add_epi16(L3, coeff8); p00 = _mm_add_epi16(p00, p30); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); M1 = _mm_srli_epi16(p00, 4); p01 = _mm_mullo_epi16(H0, coeff3); p11 = _mm_mullo_epi16(H1, coeff7); p21 = _mm_mullo_epi16(H2, coeff5); p31 = _mm_add_epi16(H3, coeff8); p01 = _mm_add_epi16(p01, p31); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); M2 = _mm_srli_epi16(p01, 4); p00 = _mm_add_epi16(L1, L2); p00 = _mm_mullo_epi16(p00, coeff3); p10 = _mm_add_epi16(L0, L3); p10 = _mm_add_epi16(p10, coeff4); p00 = _mm_add_epi16(p10, p00); M3 = _mm_srli_epi16(p00, 3); p01 = _mm_add_epi16(H1, H2); p01 = _mm_mullo_epi16(p01, coeff3); p11 = _mm_add_epi16(H0, H3); p11 = _mm_add_epi16(p11, coeff4); p01 = _mm_add_epi16(p11, p01); M4 = _mm_srli_epi16(p01, 3); p10 = _mm_mullo_epi16(L1, coeff5); p20 = _mm_mullo_epi16(L2, coeff7); p30 = _mm_mullo_epi16(L3, coeff3); p00 = _mm_add_epi16(L0, coeff8); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); M5 = _mm_srli_epi16(p00, 4); p11 = _mm_mullo_epi16(H1, coeff5); p21 = _mm_mullo_epi16(H2, coeff7); p31 = _mm_mullo_epi16(H3, coeff3); p01 = _mm_add_epi16(H0, coeff8); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); M6 = _mm_srli_epi16(p01, 4); p00 = _mm_add_epi16(L1, L2); p10 = _mm_add_epi16(L2, L3); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, coeff2); M7 = _mm_srli_epi16(p00, 2); p01 = _mm_add_epi16(H1, H2); p11 = _mm_add_epi16(H2, H3); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, coeff2); M8 = _mm_srli_epi16(p01, 2); M1 = _mm_packus_epi16(M1, M3); M5 = _mm_packus_epi16(M5, M7); M1 = _mm_shuffle_epi8(M1, shuffle); M5 = _mm_shuffle_epi8(M5, shuffle); M2 = _mm_packus_epi16(M2, M4); M6 = _mm_packus_epi16(M6, M8); M2 = _mm_shuffle_epi8(M2, shuffle); M6 = _mm_shuffle_epi8(M6, shuffle); M3 = _mm_unpacklo_epi16(M1, M5); M7 = _mm_unpackhi_epi16(M1, M5); M4 = _mm_unpacklo_epi16(M2, M6); M8 = _mm_unpackhi_epi16(M2, M6); _mm_store_si128((__m128i*)&first_line[i], M3); _mm_store_si128((__m128i*)&first_line[16 + i], M7); _mm_store_si128((__m128i*)&first_line[32 + i], M4); _mm_store_si128((__m128i*)&first_line[48 + i], M8); } if (i < left_size) { __m128i p00, p10, p20, p30; __m128i M1, M3, M5, M7; __m128i S0 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S3 = _mm_loadu_si128((__m128i*)(src + 2)); __m128i S1 = _mm_loadu_si128((__m128i*)(src)); __m128i S2 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i L3 = _mm_unpacklo_epi8(S3, zero); p00 = _mm_mullo_epi16(L0, coeff3); p10 = _mm_mullo_epi16(L1, coeff7); p20 = _mm_mullo_epi16(L2, coeff5); p30 = _mm_add_epi16(L3, coeff8); p00 = _mm_add_epi16(p00, p30); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); M1 = _mm_srli_epi16(p00, 4); p00 = _mm_add_epi16(L1, L2); p00 = _mm_mullo_epi16(p00, coeff3); p10 = _mm_add_epi16(L0, L3); p10 = _mm_add_epi16(p10, coeff4); p00 = _mm_add_epi16(p10, p00); M3 = _mm_srli_epi16(p00, 3); p10 = 
_mm_mullo_epi16(L1, coeff5); p20 = _mm_mullo_epi16(L2, coeff7); p30 = _mm_mullo_epi16(L3, coeff3); p00 = _mm_add_epi16(L0, coeff8); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); M5 = _mm_srli_epi16(p00, 4); p00 = _mm_add_epi16(L1, L2); p10 = _mm_add_epi16(L2, L3); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, coeff2); M7 = _mm_srli_epi16(p00, 2); M1 = _mm_packus_epi16(M1, M3); M5 = _mm_packus_epi16(M5, M7); M1 = _mm_shuffle_epi8(M1, shuffle); M5 = _mm_shuffle_epi8(M5, shuffle); M3 = _mm_unpacklo_epi16(M1, M5); M7 = _mm_unpackhi_epi16(M1, M5); _mm_store_si128((__m128i*)&first_line[i], M3); _mm_store_si128((__m128i*)&first_line[16 + i], M7); } src = pSrc1 + bsy; for (i = left_size; i < line_size - 8; i += 16, src += 16) { __m128i S0 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i S1 = _mm_loadu_si128((__m128i*)(src)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i H0 = _mm_unpackhi_epi8(S0, zero); __m128i H1 = _mm_unpackhi_epi8(S1, zero); __m128i H2 = _mm_unpackhi_epi8(S2, zero); __m128i sum1 = _mm_add_epi16(L0, L1); __m128i sum2 = _mm_add_epi16(L1, L2); __m128i sum3 = _mm_add_epi16(H0, H1); __m128i sum4 = _mm_add_epi16(H1, H2); sum1 = _mm_add_epi16(sum1, sum2); sum3 = _mm_add_epi16(sum3, sum4); sum1 = _mm_add_epi16(sum1, coeff2); sum3 = _mm_add_epi16(sum3, coeff2); sum1 = _mm_srli_epi16(sum1, 2); sum3 = _mm_srli_epi16(sum3, 2); sum1 = _mm_packus_epi16(sum1, sum3); _mm_storeu_si128((__m128i*)&first_line[i], sum1); } if (i < line_size) { __m128i S0 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i S1 = _mm_loadu_si128((__m128i*)(src)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i sum1 = _mm_add_epi16(L0, L1); __m128i sum2 = _mm_add_epi16(L1, L2); sum1 = _mm_add_epi16(sum1, sum2); sum1 = _mm_add_epi16(sum1, coeff2); sum1 = _mm_srli_epi16(sum1, 2); sum1 = _mm_packus_epi16(sum1, sum1); _mm_storel_epi64((__m128i*)&first_line[i], sum1); } switch (bsx) { case 8: while (bsy--) { CP64(dst, pfirst); dst += i_dst; pfirst -= 4; } break; case 16: case 32: case 64: while (bsy--) { memcpy(dst, pfirst, bsx * sizeof(pel_t)); dst += i_dst; pfirst -= 4; } break; default: assert(0); break; } } else { dst += (bsy - 1) * i_dst; for (i = 0; i < bsy; i++, src++) { dst[0] = (src[-1] * 3 + src[0] * 7 + src[1] * 5 + src[2] + 8) >> 4; dst[1] = (src[-1] + (src[0] + src[1]) * 3 + src[2] + 4) >> 3; dst[2] = (src[-1] + src[0] * 5 + src[1] * 7 + src[2] * 3 + 8) >> 4; dst[3] = (src[0] + src[1] * 2 + src[2] + 2) >> 2; dst -= i_dst; } } } /* --------------------------------------------------------------------------- */ void intra_pred_ang_xy_23_sse128(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i; UNUSED_PARAMETER(dir_mode); if (bsx > 8) { ALIGN16(pel_t first_line[64 + 512]); int left_size = (bsy << 3) - 1; int top_size = bsx - 7; int line_size = left_size + top_size; pel_t *pfirst = first_line + left_size - 7; pel_t *pfirst1 = first_line; pel_t *src_org = src; src -= bsy; __m128i zero = _mm_setzero_si128(); __m128i coeff0 = _mm_setr_epi16(7, 3, 5, 1, 3, 1, 1, 0); __m128i coeff1 = _mm_setr_epi16(15, 7, 13, 3, 11, 5, 9, 1); __m128i coeff2 = _mm_setr_epi16(9, 5, 11, 3, 13, 7, 15, 2); __m128i coeff3 = _mm_setr_epi16(1, 1, 3, 1, 5, 3, 7, 1); __m128i coeff4 = 
_mm_setr_epi16(16, 8, 16, 4, 16, 8, 16, 2); __m128i coeff5 = _mm_setr_epi16(1, 2, 1, 4, 1, 2, 1, 8); __m128i p00, p10, p20, p30; __m128i L0 = _mm_set1_epi16(src[-1]); __m128i L1 = _mm_set1_epi16(src[0]); __m128i L2 = _mm_set1_epi16(src[1]); __m128i L3 = _mm_set1_epi16(src[2]); src += 4; for (i = 0; i < left_size + 1; i += 32, src += 4) { p00 = _mm_mullo_epi16(L0, coeff0); p10 = _mm_mullo_epi16(L1, coeff1); p20 = _mm_mullo_epi16(L2, coeff2); p30 = _mm_mullo_epi16(L3, coeff3); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_mullo_epi16(p00, coeff5); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)pfirst1, p00); pfirst1 += 8; L0 = _mm_set1_epi16(src[-1]); p00 = _mm_mullo_epi16(L1, coeff0); p10 = _mm_mullo_epi16(L2, coeff1); p20 = _mm_mullo_epi16(L3, coeff2); p30 = _mm_mullo_epi16(L0, coeff3); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_mullo_epi16(p00, coeff5); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)pfirst1, p00); pfirst1 += 8; L1 = _mm_set1_epi16(src[0]); p00 = _mm_mullo_epi16(L2, coeff0); p10 = _mm_mullo_epi16(L3, coeff1); p20 = _mm_mullo_epi16(L0, coeff2); p30 = _mm_mullo_epi16(L1, coeff3); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_mullo_epi16(p00, coeff5); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)pfirst1, p00); pfirst1 += 8; L2 = _mm_set1_epi16(src[1]); p00 = _mm_mullo_epi16(L3, coeff0); p10 = _mm_mullo_epi16(L0, coeff1); p20 = _mm_mullo_epi16(L1, coeff2); p30 = _mm_mullo_epi16(L2, coeff3); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_mullo_epi16(p00, coeff5); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)pfirst1, p00); pfirst1 += 8; L3 = _mm_set1_epi16(src[2]); } src = src_org + 1; for (; i < line_size; i += 16, src += 16) { coeff2 = _mm_set1_epi16(2); __m128i p00, p10; __m128i p01, p11; __m128i S0 = _mm_loadu_si128((__m128i*)(src)); __m128i S1 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i H0 = _mm_unpackhi_epi8(S0, zero); __m128i H1 = _mm_unpackhi_epi8(S1, zero); __m128i H2 = _mm_unpackhi_epi8(S2, zero); p00 = _mm_mullo_epi16(L0, coeff2); p10 = _mm_add_epi16(L1, L2); p00 = _mm_add_epi16(p00, coeff2); p00 = _mm_add_epi16(p00, p10); p00 = _mm_srli_epi16(p00, 2); p01 = _mm_mullo_epi16(H0, coeff2); p11 = _mm_add_epi16(H1, H2); p01 = _mm_add_epi16(p01, coeff2); p01 = _mm_add_epi16(p01, p11); p01 = _mm_srli_epi16(p01, 2); p00 = _mm_packus_epi16(p00, p01); _mm_store_si128((__m128i*)&first_line[i], p00); } for (i = 0; i < bsy; i++) { memcpy(dst, pfirst, bsx * sizeof(pel_t)); dst += i_dst; pfirst -= 8; } } else if (bsx == 8) { __m128i coeff0 = _mm_setr_epi16(7, 3, 5, 1, 3, 1, 1, 0); __m128i coeff1 = _mm_setr_epi16(15, 7, 13, 3, 11, 5, 9, 1); __m128i coeff2 = _mm_setr_epi16(9, 5, 11, 3, 13, 7, 15, 2); __m128i coeff3 = _mm_setr_epi16(1, 1, 3, 1, 5, 3, 7, 1); __m128i coeff4 = _mm_setr_epi16(16, 8, 16, 4, 16, 8, 16, 2); __m128i coeff5 = _mm_setr_epi16(1, 2, 1, 4, 1, 2, 1, 8); 
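/* A sketch of the coefficient packing used in the bsx == 8 path below (the
 * same scheme as the staging loop of the bsx > 8 branch above): each 16-bit
 * lane of coeff0..coeff3 carries the taps of a different fractional phase,
 * coeff4 the matching rounding offset, and coeff5 a power-of-two scale so
 * that one common ">> 5" realizes every lane's normalization. With a..d the
 * four neighbouring samples, for example:
 *     lane 0: (7*a + 15*b + 9*c + d + 16) * 1 >> 5
 *     lane 3: (a + 3*b + 3*c + d + 4) * 4 >> 5 == (a + 3*b + 3*c + d + 4) >> 3
 *     lane 7: (b + 2*c + d + 2) * 8 >> 5 == (b + 2*c + d + 2) >> 2 */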
__m128i p00, p10, p20, p30; __m128i L0 = _mm_set1_epi16(src[-2]); __m128i L1 = _mm_set1_epi16(src[-1]); __m128i L2 = _mm_set1_epi16(src[0]); __m128i L3 = _mm_set1_epi16(src[1]); src -= 4; bsy >>= 2; for (i = 0; i < bsy; i++, src -= 4) { p00 = _mm_mullo_epi16(L0, coeff0); p10 = _mm_mullo_epi16(L1, coeff1); p20 = _mm_mullo_epi16(L2, coeff2); p30 = _mm_mullo_epi16(L3, coeff3); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_mullo_epi16(p00, coeff5); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)dst, p00); dst += i_dst; L3 = _mm_set1_epi16(src[1]); p00 = _mm_mullo_epi16(L3, coeff0); p10 = _mm_mullo_epi16(L0, coeff1); p20 = _mm_mullo_epi16(L1, coeff2); p30 = _mm_mullo_epi16(L2, coeff3); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_mullo_epi16(p00, coeff5); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)dst, p00); dst += i_dst; L2 = _mm_set1_epi16(src[0]); p00 = _mm_mullo_epi16(L2, coeff0); p10 = _mm_mullo_epi16(L3, coeff1); p20 = _mm_mullo_epi16(L0, coeff2); p30 = _mm_mullo_epi16(L1, coeff3); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_mullo_epi16(p00, coeff5); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)dst, p00); dst += i_dst; L1 = _mm_set1_epi16(src[-1]); p00 = _mm_mullo_epi16(L1, coeff0); p10 = _mm_mullo_epi16(L2, coeff1); p20 = _mm_mullo_epi16(L3, coeff2); p30 = _mm_mullo_epi16(L0, coeff3); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_mullo_epi16(p00, coeff5); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)dst, p00); dst += i_dst; L0 = _mm_set1_epi16(src[-2]); } } else { __m128i zero = _mm_setzero_si128(); __m128i coeff3 = _mm_set1_epi16(3); __m128i coeff4 = _mm_set1_epi16(4); __m128i coeff5 = _mm_set1_epi16(5); __m128i coeff7 = _mm_set1_epi16(7); __m128i coeff8 = _mm_set1_epi16(8); __m128i coeff9 = _mm_set1_epi16(9); __m128i coeff11 = _mm_set1_epi16(11); __m128i coeff13 = _mm_set1_epi16(13); __m128i coeff15 = _mm_set1_epi16(15); __m128i coeff16 = _mm_set1_epi16(16); __m128i shuffle = _mm_setr_epi8(7, 15, 6, 14, 5, 13, 4, 12, 3, 11, 2, 10, 1, 9, 0, 8); if (bsy == 4) { src -= 15; __m128i p01, p11, p21, p31; __m128i M2, M4, M6, M8; __m128i S0 = _mm_loadu_si128((__m128i*)(src - 2)); __m128i S3 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i S1 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src)); __m128i H0 = _mm_unpackhi_epi8(S0, zero); __m128i H1 = _mm_unpackhi_epi8(S1, zero); __m128i H2 = _mm_unpackhi_epi8(S2, zero); __m128i H3 = _mm_unpackhi_epi8(S3, zero); p01 = _mm_mullo_epi16(H0, coeff7); p11 = _mm_mullo_epi16(H1, coeff15); p21 = _mm_mullo_epi16(H2, coeff9); p31 = _mm_add_epi16(H3, coeff16); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); M2 = _mm_srli_epi16(p01, 5); p01 = _mm_mullo_epi16(H0, coeff3); p11 = _mm_mullo_epi16(H1, coeff7); p21 = _mm_mullo_epi16(H2, coeff5); p31 = _mm_add_epi16(H3, coeff8); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); M4 = _mm_srli_epi16(p01, 4); p01 = _mm_mullo_epi16(H0, coeff5); p11 = 
_mm_mullo_epi16(H1, coeff13); p21 = _mm_mullo_epi16(H2, coeff11); p31 = _mm_mullo_epi16(H3, coeff3); p01 = _mm_add_epi16(p01, coeff16); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); M6 = _mm_srli_epi16(p01, 5); p01 = _mm_add_epi16(H0, H3); p11 = _mm_add_epi16(H1, H2); p11 = _mm_mullo_epi16(p11, coeff3); p01 = _mm_add_epi16(p01, coeff4); p01 = _mm_add_epi16(p01, p11); M8 = _mm_srli_epi16(p01, 3); M2 = _mm_packus_epi16(M2, M4); M6 = _mm_packus_epi16(M6, M8); M2 = _mm_shuffle_epi8(M2, shuffle); M6 = _mm_shuffle_epi8(M6, shuffle); M4 = _mm_unpacklo_epi16(M2, M6); *((int*)dst) = _mm_cvtsi128_si32(M4); dst += i_dst; M4 = _mm_srli_si128(M4, 4); *((int*)dst) = _mm_cvtsi128_si32(M4); dst += i_dst; M4 = _mm_srli_si128(M4, 4); *((int*)dst) = _mm_cvtsi128_si32(M4); dst += i_dst; M4 = _mm_srli_si128(M4, 4); *((int*)dst) = _mm_cvtsi128_si32(M4); } else { src -= 15; __m128i p00, p10, p20, p30; __m128i p01, p11, p21, p31; __m128i M1, M2, M3, M4, M5, M6, M7, M8; __m128i S0 = _mm_loadu_si128((__m128i*)(src - 2)); __m128i S3 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i S1 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S2 = _mm_loadu_si128((__m128i*)(src)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i L3 = _mm_unpacklo_epi8(S3, zero); __m128i H0 = _mm_unpackhi_epi8(S0, zero); __m128i H1 = _mm_unpackhi_epi8(S1, zero); __m128i H2 = _mm_unpackhi_epi8(S2, zero); __m128i H3 = _mm_unpackhi_epi8(S3, zero); p00 = _mm_mullo_epi16(L0, coeff7); p10 = _mm_mullo_epi16(L1, coeff15); p20 = _mm_mullo_epi16(L2, coeff9); p30 = _mm_add_epi16(L3, coeff16); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); M1 = _mm_srli_epi16(p00, 5); p01 = _mm_mullo_epi16(H0, coeff7); p11 = _mm_mullo_epi16(H1, coeff15); p21 = _mm_mullo_epi16(H2, coeff9); p31 = _mm_add_epi16(H3, coeff16); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); M2 = _mm_srli_epi16(p01, 5); p00 = _mm_mullo_epi16(L0, coeff3); p10 = _mm_mullo_epi16(L1, coeff7); p20 = _mm_mullo_epi16(L2, coeff5); p30 = _mm_add_epi16(L3, coeff8); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); M3 = _mm_srli_epi16(p00, 4); p01 = _mm_mullo_epi16(H0, coeff3); p11 = _mm_mullo_epi16(H1, coeff7); p21 = _mm_mullo_epi16(H2, coeff5); p31 = _mm_add_epi16(H3, coeff8); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); M4 = _mm_srli_epi16(p01, 4); p00 = _mm_mullo_epi16(L0, coeff5); p10 = _mm_mullo_epi16(L1, coeff13); p20 = _mm_mullo_epi16(L2, coeff11); p30 = _mm_mullo_epi16(L3, coeff3); p00 = _mm_add_epi16(p00, coeff16); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); M5 = _mm_srli_epi16(p00, 5); p01 = _mm_mullo_epi16(H0, coeff5); p11 = _mm_mullo_epi16(H1, coeff13); p21 = _mm_mullo_epi16(H2, coeff11); p31 = _mm_mullo_epi16(H3, coeff3); p01 = _mm_add_epi16(p01, coeff16); p01 = _mm_add_epi16(p01, p11); p01 = _mm_add_epi16(p01, p21); p01 = _mm_add_epi16(p01, p31); M6 = _mm_srli_epi16(p01, 5); p00 = _mm_add_epi16(L0, L3); p10 = _mm_add_epi16(L1, L2); p10 = _mm_mullo_epi16(p10, coeff3); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_add_epi16(p00, p10); M7 = _mm_srli_epi16(p00, 3); p01 = _mm_add_epi16(H0, H3); p11 = _mm_add_epi16(H1, H2); p11 = _mm_mullo_epi16(p11, coeff3); p01 = _mm_add_epi16(p01, coeff4); p01 = _mm_add_epi16(p01, p11); M8 = 
_mm_srli_epi16(p01, 3); M1 = _mm_packus_epi16(M1, M3); M5 = _mm_packus_epi16(M5, M7); M1 = _mm_shuffle_epi8(M1, shuffle); M5 = _mm_shuffle_epi8(M5, shuffle); M2 = _mm_packus_epi16(M2, M4); M6 = _mm_packus_epi16(M6, M8); M2 = _mm_shuffle_epi8(M2, shuffle); M6 = _mm_shuffle_epi8(M6, shuffle); M3 = _mm_unpacklo_epi16(M1, M5); M7 = _mm_unpackhi_epi16(M1, M5); M4 = _mm_unpacklo_epi16(M2, M6); M8 = _mm_unpackhi_epi16(M2, M6); *((int*)dst) = _mm_cvtsi128_si32(M4); dst += i_dst; M4 = _mm_srli_si128(M4, 4); *((int*)dst) = _mm_cvtsi128_si32(M4); dst += i_dst; M4 = _mm_srli_si128(M4, 4); *((int*)dst) = _mm_cvtsi128_si32(M4); dst += i_dst; M4 = _mm_srli_si128(M4, 4); *((int*)dst) = _mm_cvtsi128_si32(M4); dst += i_dst; *((int*)dst) = _mm_cvtsi128_si32(M8); dst += i_dst; M8 = _mm_srli_si128(M8, 4); *((int*)dst) = _mm_cvtsi128_si32(M8); dst += i_dst; M8 = _mm_srli_si128(M8, 4); *((int*)dst) = _mm_cvtsi128_si32(M8); dst += i_dst; M8 = _mm_srli_si128(M8, 4); *((int*)dst) = _mm_cvtsi128_si32(M8); dst += i_dst; *((int*)dst) = _mm_cvtsi128_si32(M3); dst += i_dst; M3 = _mm_srli_si128(M3, 4); *((int*)dst) = _mm_cvtsi128_si32(M3); dst += i_dst; M3 = _mm_srli_si128(M3, 4); *((int*)dst) = _mm_cvtsi128_si32(M3); dst += i_dst; M3 = _mm_srli_si128(M3, 4); *((int*)dst) = _mm_cvtsi128_si32(M3); dst += i_dst; *((int*)dst) = _mm_cvtsi128_si32(M7); dst += i_dst; M7 = _mm_srli_si128(M7, 4); *((int*)dst) = _mm_cvtsi128_si32(M7); dst += i_dst; M7 = _mm_srli_si128(M7, 4); *((int*)dst) = _mm_cvtsi128_si32(M7); dst += i_dst; M7 = _mm_srli_si128(M7, 4); *((int*)dst) = _mm_cvtsi128_si32(M7); } } } #endif // #if !HIGH_BIT_DEPTH davs2-1.6/source/common/vec/intrinsic_intra-pred_avx2.cc000066400000000000000000011622221337322544400233570ustar00rootroot00000000000000/* * intrinsic_intra-pred_avx2.cc * * Description of this file: * AVX2 assembly functions of Intra-Prediction module of the davs2 library * * -------------------------------------------------------------------------- * * davs2 - video decoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. 
*/ #include "../common.h" #include "intrinsic.h" #include <mmintrin.h> #include <emmintrin.h> #include <tmmintrin.h> #include <smmintrin.h> #include <immintrin.h> #if !HIGH_BIT_DEPTH void intra_pred_ver_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { if (bsx <= 8 && bsy <= 8) { /* for block sizes of 8 or smaller, fall back from AVX2 to the SSE128 version */ intra_pred_ver_sse128(src, dst, i_dst, dir_mode, bsx, bsy); return; } pel_t *rsrc = src + 1; int i; __m256i S1; if (bsx >= 32) { for (i = 0; i < bsy; i++) { S1 = _mm256_loadu_si256((const __m256i*)(rsrc)); /* 32 */ _mm256_storeu_si256((__m256i*)(dst), S1); if (32 < bsx) { S1 = _mm256_loadu_si256((const __m256i*)(rsrc + 32)); /* 64 */ _mm256_storeu_si256((__m256i*)(dst + 32), S1); } dst += i_dst; } } else { int i, j; __m128i S1; if (bsx & 15) { /* 4/8 */ __m128i mask = _mm_load_si128((const __m128i*)intrinsic_mask[(bsx & 15) - 1]); for (i = 0; i < bsy; i++) { for (j = 0; j < bsx - 15; j += 16) { S1 = _mm_loadu_si128((const __m128i*)(rsrc + j)); _mm_storeu_si128((__m128i*)(dst + j), S1); } S1 = _mm_loadu_si128((const __m128i*)(rsrc + j)); _mm_maskmoveu_si128(S1, mask, (char *)&dst[j]); dst += i_dst; } } /*{//4/8 for (i = 0; i < bsy; i++) { for (j = 0; j < bsx; j += 4) { S1 = _mm_loadu_si128((const __m128i*)(rsrc + j)); _mm_storeu_si128((__m128i*)(dst + j), S1); } dst += i_dst; } }*/ else { for (i = 0; i < bsy; i++) { /* 16 */ S1 = _mm_loadu_si128((const __m128i*)rsrc); _mm_storeu_si128((__m128i*)dst, S1); dst += i_dst; } } } } void intra_pred_hor_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { if (bsx <= 8 && bsy <= 8) { /* for block sizes of 8 or smaller, fall back from AVX2 to the SSE128 version */ intra_pred_hor_sse128(src, dst, i_dst, dir_mode, bsx, bsy); return; } int i; pel_t *rsrc = src - 1; __m256i S1; if (bsx >= 32) { for (i = 0; i < bsy; i++) { S1 = _mm256_set1_epi8((char)rsrc[-i]); /* 32 */ _mm256_storeu_si256((__m256i*)(dst), S1); if (32 < bsx) { /* 64 */ _mm256_storeu_si256((__m256i*)(dst + 32), S1); } dst += i_dst; } } else { int i, j; __m128i S1; if (bsx & 15) { /* 4/8 */ __m128i mask = _mm_load_si128((const __m128i*)intrinsic_mask[(bsx & 15) - 1]); for (i = 0; i < bsy; i++) { for (j = 0; j < bsx - 15; j += 16) { S1 = _mm_set1_epi8((char)rsrc[-i]); _mm_storeu_si128((__m128i*)(dst + j), S1); } S1 = _mm_set1_epi8((char)rsrc[-i]); _mm_maskmoveu_si128(S1, mask, (char*)&dst[j]); dst += i_dst; } } else { for (i = 0; i < bsy; i++) { /* 16 */ S1 = _mm_set1_epi8((char)rsrc[-i]); _mm_storeu_si128((__m128i*)dst, S1); dst += i_dst; } } } } void intra_pred_dc_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { if (bsx <= 8 && bsy <= 8) { /* for block sizes of 8 or smaller, fall back from AVX2 to the SSE128 version */ intra_pred_dc_sse128(src, dst, i_dst, dir_mode, bsx, bsy); return; } int bAboveAvail = dir_mode >> 8; int bLeftAvail = dir_mode & 0xFF; int x, y; int iDCValue = 0; pel_t *rsrc = src - 1; __m256i S1; int i; if (bLeftAvail) { for (y = 0; y < bsy; y++) { iDCValue += rsrc[-y]; } rsrc = src + 1; if (bAboveAvail) { for (x = 0; x < bsx; x++) { iDCValue += rsrc[x]; } iDCValue += ((bsx + bsy) >> 1); iDCValue = (iDCValue * (512 / (bsx + bsy))) >> 9; } else { iDCValue += bsy / 2; iDCValue /= bsy; } } else { rsrc = src + 1; if (bAboveAvail) { for (x = 0; x < bsx; x++) { iDCValue += rsrc[x]; } iDCValue += bsx / 2; iDCValue /= bsx; } else { iDCValue = g_dc_value; } } /* for (y = 0; y < bsy; y++) { for (x = 0; x < bsx; x++) { dst[x] = iDCValue; } dst += i_dst; } */ S1 = _mm256_set1_epi8((char)iDCValue); if (bsx >= 32) { for (i = 0; i < bsy; i++) { _mm256_storeu_si256((__m256i*)(dst), S1); /* 32 */ if (32 < bsx) { /* 64 */ _mm256_storeu_si256((__m256i*)(dst + 32), S1); } dst += i_dst; } } else { __m128i S1; int j; S1 = _mm_set1_epi8((char)iDCValue); if
(bsx & 15) { /* 4/8 */ __m128i mask = _mm_load_si128((const __m128i*)intrinsic_mask[(bsx & 15) - 1]); for (i = 0; i < bsy; i++) { for (j = 0; j < bsx - 15; j += 16) { _mm_storeu_si128((__m128i*)(dst + j), S1); } _mm_maskmoveu_si128(S1, mask, (char*)&dst[j]); dst += i_dst; } } else { for (i = 0; i < bsy; i++) { /* 16 */ _mm_storeu_si128((__m128i*)dst, S1); dst += i_dst; } } } } void intra_pred_plane_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { pel_t *rpSrc; int iH = 0; int iV = 0; int iA, iB, iC; int x, y; int iW2 = bsx >> 1; int iH2 = bsy >> 1; int ib_mult[5] = { 13, 17, 5, 11, 23 }; int ib_shift[5] = { 7, 10, 11, 15, 19 }; int im_h = ib_mult [tab_log2[bsx] - 2]; int is_h = ib_shift[tab_log2[bsx] - 2]; int im_v = ib_mult [tab_log2[bsy] - 2]; int is_v = ib_shift[tab_log2[bsy] - 2]; int iTmp; UNUSED_PARAMETER(dir_mode); rpSrc = src + iW2; for (x = 1; x < iW2 + 1; x++) { iH += x * (rpSrc[x] - rpSrc[-x]); } rpSrc = src - iH2; for (y = 1; y < iH2 + 1; y++) { iV += y * (rpSrc[-y] - rpSrc[y]); } iA = (src[-1 - (bsy - 1)] + src[1 + bsx - 1]) << 4; iB = ((iH << 5) * im_h + (1 << (is_h - 1))) >> is_h; iC = ((iV << 5) * im_v + (1 << (is_v - 1))) >> is_v; iTmp = iA - (iH2 - 1) * iC - (iW2 - 1) * iB + 16; __m256i TC, TB, TA, T_Start, T, D, D1; __m256i mask ; TA = _mm256_set1_epi16((int16_t)iTmp); TB = _mm256_set1_epi16((int16_t)iB); TC = _mm256_set1_epi16((int16_t)iC); T_Start = _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); T_Start = _mm256_mullo_epi16(TB, T_Start); T_Start = _mm256_add_epi16(T_Start, TA); TB = _mm256_mullo_epi16(TB, _mm256_set1_epi16(16)); if (bsx == 4){ mask = _mm256_loadu_si256((const __m256i*)intrinsic_mask_256_8bit[3]); for (y = 0; y < bsy; y++) { D = _mm256_srai_epi16(T_Start, 5); D = _mm256_packus_epi16(D, D); _mm256_maskstore_epi32((int*)dst, mask, D); T_Start = _mm256_add_epi16(T_Start, TC); dst += i_dst; } } else if (bsx == 8) { mask = _mm256_loadu_si256((const __m256i*)intrinsic_mask_256_8bit[7]); for (y = 0; y < bsy; y++) { D = _mm256_srai_epi16(T_Start, 5); D = _mm256_packus_epi16(D, D); _mm256_maskstore_epi64((__int64 *)dst, mask, D); T_Start = _mm256_add_epi16(T_Start, TC); dst += i_dst; } } else if (bsx == 16) { mask = _mm256_loadu_si256((const __m256i*)intrinsic_mask_256_8bit[15]); for (y = 0; y < bsy; y++) { D = _mm256_srai_epi16(T_Start, 5); D = _mm256_packus_epi16(D, D); D = _mm256_permute4x64_epi64(D, 8); _mm256_maskstore_epi64((__int64 *)dst, mask, D); T_Start = _mm256_add_epi16(T_Start, TC); dst += i_dst; } } else { /* 32, 64 */ for (y = 0; y < bsy; y++) { T = T_Start; for (x = 0; x < bsx; x += 32) { D = _mm256_srai_epi16(T, 5); T = _mm256_add_epi16(T, TB); D1 = _mm256_srai_epi16(T, 5); D = _mm256_packus_epi16(D, D1); D = _mm256_permute4x64_epi64(D, 0x00D8); _mm256_storeu_si256((__m256i*)(dst + x), D); T = _mm256_add_epi16(T, TB); } T_Start = _mm256_add_epi16(T_Start, TC); dst += i_dst; } } } void intra_pred_bilinear_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int x, y; int ishift_x = tab_log2[bsx]; int ishift_y = tab_log2[bsy]; int ishift = DAVS2_MIN(ishift_x, ishift_y); int ishift_xy = ishift_x + ishift_y + 1; int offset = 1 << (ishift_x + ishift_y); int a, b, c, t, val; pel_t *p; __m256i T, T1, T2, T3, C1, C2, ADD; /* TODO: why must these array sizes be padded by an extra 32? check whether that is really necessary */ ALIGN32(itr_t pTop[MAX_CU_SIZE + 32]); ALIGN32(itr_t pLeft[MAX_CU_SIZE + 32]); ALIGN32(itr_t pT[MAX_CU_SIZE + 32]); ALIGN32(itr_t pL[MAX_CU_SIZE + 32]); ALIGN32(itr_t wy[MAX_CU_SIZE + 32]); UNUSED_PARAMETER(dir_mode); p = src + 1; __m256i ZERO =
_mm256_setzero_si256(); for (x = 0; x < bsx; x += 32) { T = _mm256_loadu_si256((__m256i*)(p + x));//8bit 32 T1 = _mm256_unpacklo_epi8(T, ZERO); //0 2 T2 = _mm256_unpackhi_epi8(T, ZERO); //1 3 T = _mm256_permute2x128_si256(T1, T2, 0x0020); _mm256_store_si256((__m256i*)(pTop + x), T); T = _mm256_permute2x128_si256(T1, T2, 0x0031); _mm256_store_si256((__m256i*)(pTop + x + 16), T); } for (y = 0; y < bsy; y++) { pLeft[y] = src[-1 - y]; } //p = src + 1; //for (x = 0; x < bsx; x++) { // pTop[x] = p[x]; //} //p = src - 1; //for (y = 0; y < bsy; y++) { // pLeft[y] = p[-y]; //} a = pTop[bsx - 1]; b = pLeft[bsy - 1]; if (bsx == bsy) { c = (a + b + 1) >> 1; } else { c = (((a << ishift_x) + (b << ishift_y)) * 13 + (1 << (ishift + 5))) >> (ishift + 6); } t = (c << 1) - a - b; T = _mm256_set1_epi16((int16_t)b); for (x = 0; x < bsx; x += 16) { T1 = _mm256_loadu_si256((__m256i*)(pTop + x)); T2 = _mm256_sub_epi16(T, T1); T1 = _mm256_slli_epi16(T1, ishift_y); _mm256_store_si256((__m256i*)(pT + x), T2); _mm256_store_si256((__m256i*)(pTop + x), T1); } T = _mm256_set1_epi16((int16_t)a); for (y = 0; y < bsy; y += 16) { T1 = _mm256_loadu_si256((__m256i*)(pLeft + y)); T2 = _mm256_sub_epi16(T, T1); T1 = _mm256_slli_epi16(T1, ishift_x); _mm256_store_si256((__m256i*)(pL + y), T2); _mm256_store_si256((__m256i*)(pLeft + y), T1); } T = _mm256_set1_epi16((int16_t)t); T = _mm256_mullo_epi16(T, _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); T1 = _mm256_set1_epi16((int16_t)(16 * t)); for (y = 0; y < bsy; y += 16) { _mm256_store_si256((__m256i*)(wy + y), T); T = _mm256_add_epi16(T, T1); } C1 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); C2 = _mm256_set1_epi32(8); if (bsx == 4) { __m256i pTT = _mm256_loadu_si256((__m256i*)pT); T = _mm256_loadu_si256((__m256i*)pTop); __m256i mask = _mm256_loadu_si256((__m256i*)intrinsic_mask_256_8bit[3]); for (y = 0; y < bsy; y++) { int add = (pL[y] << ishift_y) + wy[y]; ADD = _mm256_set1_epi32(add); ADD = _mm256_mullo_epi32(C1, ADD); val = (pLeft[y] << ishift_y) + offset + (pL[y] << ishift_y); ADD = _mm256_add_epi32(ADD, _mm256_set1_epi32(val)); T = _mm256_add_epi16(T, pTT); T1 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(T, 0)); T1 = _mm256_slli_epi32(T1, ishift_x); T1 = _mm256_add_epi32(T1, ADD); T1 = _mm256_srai_epi32(T1, ishift_xy); T1 = _mm256_packus_epi32(T1, T1); T1 = _mm256_packus_epi16(T1, T1); _mm256_maskstore_epi32((int*)dst, mask, T1); dst += i_dst; } } else if (bsx == 8) { __m256i pTT = _mm256_load_si256((__m256i*)pT); T = _mm256_load_si256((__m256i*)pTop); __m256i mask = _mm256_loadu_si256((__m256i*)intrinsic_mask_256_8bit[7]); for (y = 0; y < bsy; y++) { int add = (pL[y] << ishift_y) + wy[y]; ADD = _mm256_set1_epi32(add); ADD = _mm256_mullo_epi32(C1, ADD); val = (pLeft[y] << ishift_y) + offset + (pL[y] << ishift_y); ADD = _mm256_add_epi32(ADD, _mm256_set1_epi32(val)); T = _mm256_add_epi16(T, pTT); T1 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(T, 0)); T1 = _mm256_slli_epi32(T1, ishift_x); T1 = _mm256_add_epi32(T1, ADD); T1 = _mm256_srai_epi32(T1, ishift_xy); //mask //T1 is the result T1 = _mm256_packus_epi32(T1, T1); //1 2 3 4 1 2 3 4 5 6 7 8 5 6 7 8 T1 = _mm256_permute4x64_epi64(T1, 0x0008); T1 = _mm256_packus_epi16(T1, T1); _mm256_maskstore_epi64((__int64 *)dst, mask, T1); dst += i_dst; } } else { __m256i TT[8]; __m256i PTT[8]; __m256i temp1, temp2; __m256i mask1 = _mm256_set_epi32(3, 2, 1, 0, 5, 1, 4, 0); __m256i mask2 = _mm256_loadu_si256((__m256i*)intrinsic_mask_256_8bit[15]); for (x = 0; x < bsx; x += 16) { int idx = x >> 3; __m256i M0 = 
_mm256_loadu_si256((__m256i*)(pTop + x)); //0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 __m256i M1 = _mm256_loadu_si256((__m256i*)(pT + x)); temp1 = _mm256_unpacklo_epi16(M0, ZERO); //0 1 2 3 8 9 10 11 temp2 = _mm256_unpackhi_epi16(M0, ZERO); //4 5 6 7 12 13 14 15 TT[idx] = _mm256_permute2x128_si256(temp1, temp2, 0x0020); //0 1 2 3 4 5 6 7 TT[idx + 1] = _mm256_permute2x128_si256(temp1, temp2, 0x0031); //8 9 10 11 12 13 14 15 PTT[idx] = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(M1, 0)); PTT[idx + 1] = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(M1, 1)); } for (y = 0; y < bsy; y++) { int add = (pL[y] << ishift_y) + wy[y]; ADD = _mm256_set1_epi32(add); T3 = _mm256_mullo_epi32(C2, ADD); ADD = _mm256_mullo_epi32(C1, ADD); val = (pLeft[y] << ishift_y) + offset + (pL[y] << ishift_y); ADD = _mm256_add_epi32(ADD, _mm256_set1_epi32(val)); for (x = 0; x < bsx; x += 16) { int idx = x >> 3; TT[idx] = _mm256_add_epi32(TT[idx], PTT[idx]); //0 1 2 3 4 5 6 7 TT[idx + 1] = _mm256_add_epi32(TT[idx + 1], PTT[idx + 1]); //8 9 10 11 12 13 14 15 T1 = _mm256_slli_epi32(TT[idx], ishift_x); T2 = _mm256_slli_epi32(TT[idx + 1], ishift_x); T1 = _mm256_add_epi32(T1, ADD); T1 = _mm256_srai_epi32(T1, ishift_xy);//0 1 2 3 4 5 6 7 ADD = _mm256_add_epi32(ADD, T3); T2 = _mm256_add_epi32(T2, ADD); T2 = _mm256_srai_epi32(T2, ishift_xy);//8 9 10 11 12 13 14 15 //T1 T2 is the result T1 = _mm256_packus_epi32(T1, T2); //0 1 2 3 8 9 10 11 4 5 6 7 12 13 14 15 T1 = _mm256_packus_epi16(T1, T1); //0 1 2 3 8 9 10 11 0 1 2 3 8 9 10 11 4 5 6 7 12 13 14 15 4 5 6 7 12 13 14 15 T1 = _mm256_permutevar8x32_epi32(T1, mask1); //store 128 bits _mm256_maskstore_epi64((__int64 *)(dst + x), mask2, T1); ADD = _mm256_add_epi32(ADD, T3); } dst += i_dst; } } } void intra_pred_ang_x_3_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { pel_t *dst1 = dst; pel_t *dst2 = dst1 + i_dst; pel_t *dst3 = dst2 + i_dst; pel_t *dst4 = dst3 + i_dst; UNUSED_PARAMETER(dir_mode); if ((bsy > 4) && (bsx > 8)) { __m256i coeff2 = _mm256_set1_epi16(2); __m256i coeff3 = _mm256_set1_epi16(3); __m256i coeff4 = _mm256_set1_epi16(4); __m256i coeff5 = _mm256_set1_epi16(5); __m256i coeff7 = _mm256_set1_epi16(7); __m256i coeff8 = _mm256_set1_epi16(8); ALIGN32(pel_t first_line[(64 + 176 + 16) << 2]); int line_size = bsx + (((bsy - 4) * 11) >> 2); #if !BUGFIX_PREDICTION_INTRA int iW2 = bsx * 2 - 1; int real_size = DAVS2_MIN(line_size, iW2 + 1); #endif int aligned_line_size = 64 + 176 + 16; int i; pel_t *pfirst[4]; #if !BUGFIX_PREDICTION_INTRA pel_t *src_org = src; #endif pfirst[0] = first_line; pfirst[1] = pfirst[0] + aligned_line_size; pfirst[2] = pfirst[1] + aligned_line_size; pfirst[3] = pfirst[2] + aligned_line_size; __m256i p00, p10, p20, p30; __m256i p01, p11, p21, p31; __m256i SS2, SS11; __m256i L2, L3, L4, L5, L6, L7, L8, L9, L10, L11, L12, L13; __m256i H2, H3, H4, H5, H6, H7, H8, H9, H10, H11, H12, H13; #if BUGFIX_PREDICTION_INTRA for (i = 0; i < line_size - 16; i += 32, src += 32) { #else for (i = 0; i < real_size - 16; i += 32, src += 32) { #endif SS2 = _mm256_loadu_si256((__m256i*)(src + 2));//2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 L2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 0));//2...17 H2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 1));//18...34 SS2 = _mm256_loadu_si256((__m256i*)(src + 3));//3 4 5 6 7 8 9 10 11 12 13 14 15 L3 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 0));//3...18 H3 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 1));//19...35 SS2 = _mm256_loadu_si256((__m256i*)(src + 4));//4 5 6 7 8 9 10 11 
12 13 14 15 L4 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 0));//4 H4 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 1));//20 SS2 = _mm256_loadu_si256((__m256i*)(src + 5));//5 6 7 8 9 10 11 12 13 14 15 L5 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 0));//5 H5 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 1));//21 SS2 = _mm256_loadu_si256((__m256i*)(src + 6));//6 7 8 9 10 11 12 13 14 15 L6 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 0));//6 H6 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 1));//22 SS2 = _mm256_loadu_si256((__m256i*)(src + 7));//7 8 9 10 11 12 13 14 15 L7 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 0)); H7 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 1)); SS2 = _mm256_loadu_si256((__m256i*)(src + 8));//8 9 10 11 12 13 14 15 L8 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 0)); H8 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 1)); SS2 = _mm256_loadu_si256((__m256i*)(src + 9));//9 10 11 12 13 14 15 L9 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 0)); H9 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 1)); SS2 = _mm256_loadu_si256((__m256i*)(src + 10));//10 11 12 13 14 15 L10 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 0)); H10 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 1)); SS2 = _mm256_loadu_si256((__m256i*)(src + 11));//11 12 13 14 15 16 17 18 19 20 21 22 23 L11 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 0)); H11 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 1)); SS2 = _mm256_loadu_si256((__m256i*)(src + 12));//12 13 14 15 16 17 18 19 20... L12 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 0)); H12 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 1)); SS2 = _mm256_loadu_si256((__m256i*)(src + 13));//13 ...28 29...44 L13 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 0)); H13 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 1)); p00 = _mm256_add_epi16(L2, coeff8);//2 ...17 p10 = _mm256_mullo_epi16(L3, coeff5); p20 = _mm256_mullo_epi16(L4, coeff7); p30 = _mm256_mullo_epi16(L5, coeff3); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 4); p01 = _mm256_add_epi16(H2, coeff8); p11 = _mm256_mullo_epi16(H3, coeff5); p21 = _mm256_mullo_epi16(H4, coeff7); p31 = _mm256_mullo_epi16(H5, coeff3); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, p21); p01 = _mm256_add_epi16(p01, p31); p01 = _mm256_srli_epi16(p01, 4); p00 = _mm256_packus_epi16(p00, p01); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_storeu_si256((__m256i*)&pfirst[0][i], p00); p00 = _mm256_add_epi16(L5, L8); p10 = _mm256_add_epi16(L6, L7); p10 = _mm256_mullo_epi16(p10, coeff3); p00 = _mm256_add_epi16(p00, coeff4); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_srli_epi16(p00, 3); p01 = _mm256_add_epi16(H5, H8); p11 = _mm256_add_epi16(H6, H7); p11 = _mm256_mullo_epi16(p11, coeff3); p01 = _mm256_add_epi16(p01, coeff4); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_srli_epi16(p01, 3); p00 = _mm256_packus_epi16(p00, p01); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_storeu_si256((__m256i*)&pfirst[1][i], p00); p00 = _mm256_mullo_epi16(L8, coeff3); p10 = _mm256_mullo_epi16(L9, coeff7); p20 = _mm256_mullo_epi16(L10, coeff5); p30 = _mm256_add_epi16(L11, coeff8); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 4); p01 = _mm256_mullo_epi16(H8, coeff3); p11 = _mm256_mullo_epi16(H9, coeff7); p21 = 
_mm256_mullo_epi16(H10, coeff5); p31 = _mm256_add_epi16(H11, coeff8); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, p21); p01 = _mm256_add_epi16(p01, p31); p01 = _mm256_srli_epi16(p01, 4); p00 = _mm256_packus_epi16(p00, p01); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_storeu_si256((__m256i*)&pfirst[2][i], p00); p00 = _mm256_add_epi16(L11, L13); p10 = _mm256_mullo_epi16(L12, coeff2); p00 = _mm256_add_epi16(p00, coeff2); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_srli_epi16(p00, 2); p01 = _mm256_add_epi16(H11, H13); p11 = _mm256_mullo_epi16(H12, coeff2); p01 = _mm256_add_epi16(p01, coeff2); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_srli_epi16(p01, 2); p00 = _mm256_packus_epi16(p00, p01); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_storeu_si256((__m256i*)&pfirst[3][i], p00); } #if BUGFIX_PREDICTION_INTRA if (i < line_size) { #else if (i < real_size) { #endif SS2 = _mm256_loadu_si256((__m256i*)(src + 2)); L2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 0)); SS2 = _mm256_loadu_si256((__m256i*)(src + 3)); L3 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 0)); SS2 = _mm256_loadu_si256((__m256i*)(src + 4)); L4 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 0)); SS2 = _mm256_loadu_si256((__m256i*)(src + 5)); L5 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 0)); SS2 = _mm256_loadu_si256((__m256i*)(src + 6)); L6 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 0)); SS2 = _mm256_loadu_si256((__m256i*)(src + 7)); L7 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 0)); SS2 = _mm256_loadu_si256((__m256i*)(src + 8)); L8 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 0)); SS2 = _mm256_loadu_si256((__m256i*)(src + 9)); L9 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 0)); SS2 = _mm256_loadu_si256((__m256i*)(src + 10)); L10 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 0)); SS11 = _mm256_loadu_si256((__m256i*)(src + 11)); L11 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS11, 0)); SS11 = _mm256_loadu_si256((__m256i*)(src + 12)); L12 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS11, 0)); SS11 = _mm256_loadu_si256((__m256i*)(src + 13)); L13 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS11, 0)); p00 = _mm256_add_epi16(L2, coeff8); p10 = _mm256_mullo_epi16(L3, coeff5); p20 = _mm256_mullo_epi16(L4, coeff7); p30 = _mm256_mullo_epi16(L5, coeff3); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 4); p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 0x0008); __m256i mask = _mm256_loadu_si256((__m256i*)intrinsic_mask_256_8bit[15]); _mm256_maskstore_epi64((__int64 *)&pfirst[0][i], mask, p00); p00 = _mm256_add_epi16(L5, L8); p10 = _mm256_add_epi16(L6, L7); p10 = _mm256_mullo_epi16(p10, coeff3); p00 = _mm256_add_epi16(p00, coeff4); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_srli_epi16(p00, 3); p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 0x0008); _mm256_maskstore_epi64((__int64 *)&pfirst[1][i], mask, p00); p00 = _mm256_mullo_epi16(L8, coeff3); p10 = _mm256_mullo_epi16(L9, coeff7); p20 = _mm256_mullo_epi16(L10, coeff5); p30 = _mm256_add_epi16(L11, coeff8); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 4); p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 0x0008); _mm256_maskstore_epi64((__int64 *)&pfirst[2][i], mask, p00); p00 = _mm256_add_epi16(L11, L13); p10 = 
_mm256_mullo_epi16(L12, coeff2); p00 = _mm256_add_epi16(p00, coeff2); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_srli_epi16(p00, 2); p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 0x0008); _mm256_maskstore_epi64((__int64 *)&pfirst[3][i], mask, p00); } bsy >>= 2; __m256i M; if (bsx == 64){ for (i = 0; i < bsy; i++){ M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i * 11)); _mm256_storeu_si256((__m256i*)dst1, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i * 11 + 32)); _mm256_storeu_si256((__m256i*)(dst1 + 32), M); M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i * 11)); _mm256_storeu_si256((__m256i*)dst2, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i * 11 + 32)); _mm256_storeu_si256((__m256i*)(dst2 + 32), M); M = _mm256_lddqu_si256((__m256i*)(pfirst[2] + i * 11)); _mm256_storeu_si256((__m256i*)dst3, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[2] + i * 11 + 32)); _mm256_storeu_si256((__m256i*)(dst3 + 32), M); M = _mm256_lddqu_si256((__m256i*)(pfirst[3] + i * 11)); _mm256_storeu_si256((__m256i*)dst4, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[3] + i * 11 + 32)); _mm256_storeu_si256((__m256i*)(dst4 + 32), M); dst1 = dst4 + i_dst; dst2 = dst1 + i_dst; dst3 = dst2 + i_dst; dst4 = dst3 + i_dst; } } else if (bsx == 32) { for (i = 0; i < bsy; i++){ M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i * 11)); _mm256_storeu_si256((__m256i*)dst1, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i * 11)); _mm256_storeu_si256((__m256i*)dst2, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[2] + i * 11)); _mm256_storeu_si256((__m256i*)dst3, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[3] + i * 11)); _mm256_storeu_si256((__m256i*)dst4, M); dst1 = dst4 + i_dst; dst2 = dst1 + i_dst; dst3 = dst2 + i_dst; dst4 = dst3 + i_dst; } } else { __m256i mask = _mm256_lddqu_si256((__m256i*)intrinsic_mask_256_8bit[15]); for (i = 0; i < bsy; i++){ M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i * 11)); _mm256_maskstore_epi64((__int64 *)dst1, mask, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i * 11)); _mm256_maskstore_epi64((__int64 *)dst2, mask, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[2] + i * 11)); _mm256_maskstore_epi64((__int64 *)dst3, mask, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[3] + i * 11)); _mm256_maskstore_epi64((__int64 *)dst4, mask, M); dst1 = dst4 + i_dst; dst2 = dst1 + i_dst; dst3 = dst2 + i_dst; dst4 = dst3 + i_dst; } } /*for (i = 0; i < bsy; i++) { memcpy(dst1, pfirst[0] + i * 11, bsx * sizeof(pel_t)); memcpy(dst2, pfirst[1] + i * 11, bsx * sizeof(pel_t)); memcpy(dst3, pfirst[2] + i * 11, bsx * sizeof(pel_t)); memcpy(dst4, pfirst[3] + i * 11, bsx * sizeof(pel_t)); dst1 = dst4 + i_dst; dst2 = dst1 + i_dst; dst3 = dst2 + i_dst; dst4 = dst3 + i_dst; }*/ } else if (bsx == 16) { __m256i coeff2 = _mm256_set1_epi16(2); __m256i coeff3 = _mm256_set1_epi16(3); __m256i coeff4 = _mm256_set1_epi16(4); __m256i coeff5 = _mm256_set1_epi16(5); __m256i coeff7 = _mm256_set1_epi16(7); __m256i coeff8 = _mm256_set1_epi16(8); __m256i p00, p10, p20, p30; __m256i SS2 = _mm256_loadu_si256((__m256i*)(src + 2)); __m256i L2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 0)); SS2 = _mm256_loadu_si256((__m256i*)(src + 3)); __m256i L3 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 0)); SS2 = _mm256_loadu_si256((__m256i*)(src + 4)); __m256i L4 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 0)); SS2 = _mm256_loadu_si256((__m256i*)(src + 5)); __m256i L5 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 0)); SS2 = _mm256_loadu_si256((__m256i*)(src + 6)); __m256i L6 = 
_mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 0)); SS2 = _mm256_loadu_si256((__m256i*)(src + 7)); __m256i L7 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 0)); SS2 = _mm256_loadu_si256((__m256i*)(src + 8)); __m256i L8 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 0)); SS2 = _mm256_loadu_si256((__m256i*)(src + 9)); __m256i L9 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 0)); SS2 = _mm256_loadu_si256((__m256i*)(src + 10)); __m256i L10 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS2, 0)); __m256i SS11 = _mm256_loadu_si256((__m256i*)(src + 11)); __m256i L11 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS11, 0)); SS11 = _mm256_loadu_si256((__m256i*)(src + 12)); __m256i L12 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS11, 0)); SS11 = _mm256_loadu_si256((__m256i*)(src + 13)); __m256i L13 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS11, 0)); p00 = _mm256_add_epi16(L2, coeff8); p10 = _mm256_mullo_epi16(L3, coeff5); p20 = _mm256_mullo_epi16(L4, coeff7); p30 = _mm256_mullo_epi16(L5, coeff3); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 4); p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 0x0008); __m256i mask = _mm256_loadu_si256((__m256i*)intrinsic_mask_256_8bit[15]); _mm256_maskstore_epi64((__int64 *)dst1, mask, p00); p00 = _mm256_add_epi16(L5, L8); p10 = _mm256_add_epi16(L6, L7); p10 = _mm256_mullo_epi16(p10, coeff3); p00 = _mm256_add_epi16(p00, coeff4); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_srli_epi16(p00, 3); p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 0x0008); _mm256_maskstore_epi64((__int64 *)dst2, mask, p00); p00 = _mm256_mullo_epi16(L8, coeff3); p10 = _mm256_mullo_epi16(L9, coeff7); p20 = _mm256_mullo_epi16(L10, coeff5); p30 = _mm256_add_epi16(L11, coeff8); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 4); p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 0x0008); _mm256_maskstore_epi64((__int64 *)dst3, mask, p00); p00 = _mm256_add_epi16(L11, L13); p10 = _mm256_mullo_epi16(L12, coeff2); p00 = _mm256_add_epi16(p00, coeff2); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_srli_epi16(p00, 2); p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 0x0008); _mm256_maskstore_epi64((__int64 *)dst4, mask, p00); } else { //8x8 8x32 4x16 4x4 intra_pred_ang_x_3_sse128(src, dst, i_dst, dir_mode, bsx, bsy); } } void intra_pred_ang_x_4_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { if (bsx != bsy && bsx < bsy){ intra_pred_ang_x_4_sse128(src, dst, i_dst, dir_mode, bsx, bsy); return; } ALIGN32(pel_t first_line[64 + 128]); int line_size = bsx + ((bsy - 1) << 1); int iHeight2 = bsy << 1; int i; __m256i zero = _mm256_setzero_si256(); __m256i offset = _mm256_set1_epi16(2); UNUSED_PARAMETER(dir_mode); src += 3; #if BUGFIX_PREDICTION_INTRA for (i = 0; i < line_size - 16; i += 32, src += 32) { #else for (i = 0; i < real_size - 16; i += 32, src += 32) { #endif //0 1 2 3 .... 12 13 14 15 16 17 18 19 .... 
28 29 30 31 __m256i S0 = _mm256_loadu_si256((__m256i*)(src - 1)); __m256i S1 = _mm256_loadu_si256((__m256i*)(src)); __m256i S2 = _mm256_loadu_si256((__m256i*)(src + 1)); __m256i L0 = _mm256_unpacklo_epi8(S0, zero);//0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23 __m256i L1 = _mm256_unpacklo_epi8(S1, zero); __m256i L2 = _mm256_unpacklo_epi8(S2, zero); __m256i H0 = _mm256_unpackhi_epi8(S0, zero);//8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31 __m256i H1 = _mm256_unpackhi_epi8(S1, zero); __m256i H2 = _mm256_unpackhi_epi8(S2, zero); __m256i tmp0 = _mm256_permute2x128_si256(L0, H0, 0x0020);//0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 __m256i tmp1 = _mm256_permute2x128_si256(L1, H1, 0x0020); __m256i tmp2 = _mm256_permute2x128_si256(L2, H2, 0x0020); __m256i sum1 = _mm256_add_epi16(tmp0, tmp1); __m256i sum2 = _mm256_add_epi16(tmp1, tmp2); tmp0 = _mm256_permute2x128_si256(L0, H0, 0x0031);//16 17...24 25... tmp1 = _mm256_permute2x128_si256(L1, H1, 0x0031); tmp2 = _mm256_permute2x128_si256(L2, H2, 0x0031); __m256i sum3 = _mm256_add_epi16(tmp0, tmp1); __m256i sum4 = _mm256_add_epi16(tmp1, tmp2); sum1 = _mm256_add_epi16(sum1, sum2); sum3 = _mm256_add_epi16(sum3, sum4); sum1 = _mm256_add_epi16(sum1, offset); sum3 = _mm256_add_epi16(sum3, offset); sum1 = _mm256_srli_epi16(sum1, 2); sum3 = _mm256_srli_epi16(sum3, 2); sum1 = _mm256_packus_epi16(sum1, sum3);//0 2 1 3 sum1 = _mm256_permute4x64_epi64(sum1, 0x00D8); _mm256_storeu_si256((__m256i*)&first_line[i], sum1); } #if BUGFIX_PREDICTION_INTRA if (i < line_size) { #else if (i < real_size) { #endif //0 1 2 3 .... 12 13 14 15 16 17 18 19 .... 28 29 30 31 __m256i S0 = _mm256_loadu_si256((__m256i*)(src - 1)); __m256i S2 = _mm256_loadu_si256((__m256i*)(src + 1)); __m256i S1 = _mm256_loadu_si256((__m256i*)(src)); S0 = _mm256_permute4x64_epi64(S0, 0x00D8); S2 = _mm256_permute4x64_epi64(S2, 0x00D8); S1 = _mm256_permute4x64_epi64(S1, 0x00D8); __m256i L0 = _mm256_unpacklo_epi8(S0, zero); __m256i L1 = _mm256_unpacklo_epi8(S1, zero); __m256i L2 = _mm256_unpacklo_epi8(S2, zero); __m256i sum1 = _mm256_add_epi16(L0, L1); __m256i sum2 = _mm256_add_epi16(L1, L2); sum1 = _mm256_add_epi16(sum1, sum2); sum1 = _mm256_add_epi16(sum1, offset); sum1 = _mm256_srli_epi16(sum1, 2); sum1 = _mm256_packus_epi16(sum1, sum1); sum1 = _mm256_permute4x64_epi64(sum1, 0x0008); //store 128 bit __m256i mask2 = _mm256_loadu_si256((__m256i*)intrinsic_mask_256_8bit[15]); _mm256_maskstore_epi64((__int64 *)(first_line + i), mask2, sum1); //_mm_storel_epi64((__m128i*)&first_line[i], sum1); } if (bsx == 64){ for (i = 0; i < iHeight2; i += 8) { __m256i M = _mm256_lddqu_si256((__m256i*)&first_line[i]); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(&first_line[i]+32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)&first_line[i + 2]); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(&first_line[i + 2] + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)&first_line[i + 4]); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(&first_line[i + 4] + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)&first_line[i + 6]); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(&first_line[i + 6] + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; } } else if (bsx == 32){ for (i = 0; i < iHeight2; i += 8){ __m256i M = _mm256_lddqu_si256((__m256i*)&first_line[i]); 
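/* Note (illustrative comment, not part of the original source): first_line[]
 * was built above as the 3-tap smoothed reference; relative to the already
 * advanced src pointer, the scalar equivalent is
 *     first_line[k] = (src[k - 1] + 2 * src[k] + src[k + 1] + 2) >> 2;
 * Mode x_4 advances two samples per output row, so this loop streams
 * first_line + i, i + 2, i + 4 and i + 6 into four successive dst rows. */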
_mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)&first_line[i + 2]); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)&first_line[i + 4]); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)&first_line[i + 6]); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; } } else if (bsx == 16){ __m256i mask = _mm256_loadu_si256((const __m256i*)intrinsic_mask_256_8bit[bsx - 1]); for (i = 0; i < iHeight2; i += 8) { __m256i M = _mm256_loadu_si256((__m256i*)&first_line[i]); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_loadu_si256((__m256i*)&first_line[i + 2]); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_loadu_si256((__m256i*)&first_line[i + 4]); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_loadu_si256((__m256i*)&first_line[i + 6]); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; } } else if (bsx == 8){ __m256i mask = _mm256_loadu_si256((const __m256i*)intrinsic_mask_256_8bit[bsx - 1]); for (i = 0; i < iHeight2; i += 8) { __m256i M = _mm256_loadu_si256((__m256i*)&first_line[i]); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_srli_si256(M, 2); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_srli_si256(M, 2); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_srli_si256(M, 2); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; } } else{ __m256i mask = _mm256_loadu_si256((const __m256i*)intrinsic_mask_256_8bit[bsx - 1]); for (i = 0; i < iHeight2; i += 8) { __m256i M = _mm256_loadu_si256((__m256i*)&first_line[i]); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_srli_si256(M, 2); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_srli_si256(M, 2); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_srli_si256(M, 2); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; } } /*if (bsx == bsy || bsx >= 16) { for (i = 0; i < iHeight2; i += 2) { memcpy(dst, first_line + i, bsx * sizeof(pel_t)); dst += i_dst; } } else { __m256i mask = _mm256_loadu_si256((const __m256i*)intrinsic_mask_256_8bit[bsx - 1]); for (i = 0; i < iHeight2; i += 8) { __m256i M = _mm256_loadu_si256((__m256i*)&first_line[i]); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_srli_si256(M, 2); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_srli_si256(M, 2); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_srli_si256(M, 2); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; } }*/ } void intra_pred_ang_x_5_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { __m256i coeff2 = _mm256_set1_epi16(2); __m256i coeff3 = _mm256_set1_epi16(3); __m256i coeff4 = _mm256_set1_epi16(4); __m256i coeff5 = _mm256_set1_epi16(5); __m256i coeff7 = _mm256_set1_epi16(7); __m256i coeff8 = _mm256_set1_epi16(8); __m256i coeff9 = _mm256_set1_epi16(9); __m256i coeff11 = _mm256_set1_epi16(11); __m256i coeff13 = _mm256_set1_epi16(13); __m256i coeff15 = _mm256_set1_epi16(15); __m256i coeff16 = _mm256_set1_epi16(16); UNUSED_PARAMETER(dir_mode); int i; if (((bsy > 4) && (bsx > 8))) { ALIGN32(pel_t first_line[(64 + 80 + 16) << 3]); int line_size = bsx + ((bsy - 8) >> 3) * 11; #if !BUGFIX_PREDICTION_INTRA int iW2 = bsx * 2 - 1; int real_size = DAVS2_MIN(line_size, iW2 + 1); #endif int aligned_line_size = (((line_size + 15) >> 4) << 4) + 
16; pel_t *pfirst[8]; #if !BUGFIX_PREDICTION_INTRA pel_t *src_org = src; #endif pel_t *dst1 = dst; pel_t *dst2 = dst1 + i_dst; pel_t *dst3 = dst2 + i_dst; pel_t *dst4 = dst3 + i_dst; pel_t *dst5 = dst4 + i_dst; pel_t *dst6 = dst5 + i_dst; pel_t *dst7 = dst6 + i_dst; pel_t *dst8 = dst7 + i_dst; pfirst[0] = first_line; pfirst[1] = pfirst[0] + aligned_line_size; pfirst[2] = pfirst[1] + aligned_line_size; pfirst[3] = pfirst[2] + aligned_line_size; pfirst[4] = pfirst[3] + aligned_line_size; pfirst[5] = pfirst[4] + aligned_line_size; pfirst[6] = pfirst[5] + aligned_line_size; pfirst[7] = pfirst[6] + aligned_line_size; __m256i p00, p10, p20, p30; __m256i p01, p11, p21, p31; __m256i SS1; __m256i L1, L2, L3, L4, L5, L6, L7, L8, L9, L10, L11, L12, L13; __m256i H1, H2, H3, H4, H5, H6, H7, H8, H9, H10, H11, H12, H13; #if BUGFIX_PREDICTION_INTRA for (i = 0; i < line_size - 16; i += 32, src += 32) { #else for (i = 0; i < real_size - 16; i += 32, src += 32) { #endif SS1 = _mm256_loadu_si256((__m256i*)(src + 1));//1...8 9...16 17..24 25..32 L1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 0));//1 H1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 1));//17 SS1 = _mm256_loadu_si256((__m256i*)(src + 2)); L2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 0));//2 H2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 1));//18 SS1 = _mm256_loadu_si256((__m256i*)(src + 3)); L3 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 0));//3 H3 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 1));//19 SS1 = _mm256_loadu_si256((__m256i*)(src + 4)); L4 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 0));//4 H4 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 1));//20 SS1 = _mm256_loadu_si256((__m256i*)(src + 5)); L5 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 0)); H5 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 1)); SS1 = _mm256_loadu_si256((__m256i*)(src + 6)); L6 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 0)); H6 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 1)); SS1 = _mm256_loadu_si256((__m256i*)(src + 7)); L7 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 0)); H7 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 1)); SS1 = _mm256_loadu_si256((__m256i*)(src + 8)); L8 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 0)); H8 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 1)); SS1 = _mm256_loadu_si256((__m256i*)(src + 9)); L9 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 0)); H9 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 1)); SS1 = _mm256_loadu_si256((__m256i*)(src + 10)); L10 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 0)); H10 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 1)); SS1 = _mm256_loadu_si256((__m256i*)(src + 11)); L11 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 0)); H11 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 1)); SS1 = _mm256_loadu_si256((__m256i*)(src + 12)); L12 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 0)); H12 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 1)); SS1 = _mm256_loadu_si256((__m256i*)(src + 13)); L13 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 0)); H13 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 1)); p00 = _mm256_mullo_epi16(L1, coeff5); p10 = _mm256_mullo_epi16(L2, coeff13); p20 = _mm256_mullo_epi16(L3, coeff11); p30 = _mm256_mullo_epi16(L4, coeff3); p00 = _mm256_add_epi16(p00, coeff16); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 
5); p01 = _mm256_mullo_epi16(H1, coeff5); p11 = _mm256_mullo_epi16(H2, coeff13); p21 = _mm256_mullo_epi16(H3, coeff11); p31 = _mm256_mullo_epi16(H4, coeff3); p01 = _mm256_add_epi16(p01, coeff16); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, p21); p01 = _mm256_add_epi16(p01, p31); p01 = _mm256_srli_epi16(p01, 5); p00 = _mm256_packus_epi16(p00, p01); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_storeu_si256((__m256i*)&pfirst[0][i], p00); p10 = _mm256_mullo_epi16(L3, coeff5); p20 = _mm256_mullo_epi16(L4, coeff7); p30 = _mm256_mullo_epi16(L5, coeff3); p00 = _mm256_add_epi16(L2, coeff8); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 4); p11 = _mm256_mullo_epi16(H3, coeff5); p21 = _mm256_mullo_epi16(H4, coeff7); p31 = _mm256_mullo_epi16(H5, coeff3); p01 = _mm256_add_epi16(H2, coeff8); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, p21); p01 = _mm256_add_epi16(p01, p31); p01 = _mm256_srli_epi16(p01, 4); p00 = _mm256_packus_epi16(p00, p01); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_storeu_si256((__m256i*)&pfirst[1][i], p00); p00 = _mm256_mullo_epi16(L4, coeff7); p10 = _mm256_mullo_epi16(L5, coeff15); p20 = _mm256_mullo_epi16(L6, coeff9); p30 = _mm256_add_epi16(L7, coeff16); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 5); p01 = _mm256_mullo_epi16(H4, coeff7); p11 = _mm256_mullo_epi16(H5, coeff15); p21 = _mm256_mullo_epi16(H6, coeff9); p31 = _mm256_add_epi16(H7, coeff16); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, p21); p01 = _mm256_add_epi16(p01, p31); p01 = _mm256_srli_epi16(p01, 5); p00 = _mm256_packus_epi16(p00, p01); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_storeu_si256((__m256i*)&pfirst[2][i], p00); p00 = _mm256_add_epi16(L5, L8); p10 = _mm256_add_epi16(L6, L7); p10 = _mm256_mullo_epi16(p10, coeff3); p00 = _mm256_add_epi16(p00, coeff4); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_srli_epi16(p00, 3); p01 = _mm256_add_epi16(H5, H8); p11 = _mm256_add_epi16(H6, H7); p11 = _mm256_mullo_epi16(p11, coeff3); p01 = _mm256_add_epi16(p01, coeff4); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_srli_epi16(p01, 3); p00 = _mm256_packus_epi16(p00, p01); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_storeu_si256((__m256i*)&pfirst[3][i], p00); p00 = _mm256_add_epi16(L6, coeff16); p10 = _mm256_mullo_epi16(L7, coeff9); p20 = _mm256_mullo_epi16(L8, coeff15); p30 = _mm256_mullo_epi16(L9, coeff7); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 5); p01 = _mm256_add_epi16(H6, coeff16); p11 = _mm256_mullo_epi16(H7, coeff9); p21 = _mm256_mullo_epi16(H8, coeff15); p31 = _mm256_mullo_epi16(H9, coeff7); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, p21); p01 = _mm256_add_epi16(p01, p31); p01 = _mm256_srli_epi16(p01, 5); p00 = _mm256_packus_epi16(p00, p01); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_storeu_si256((__m256i*)&pfirst[4][i], p00); p00 = _mm256_mullo_epi16(L8, coeff3); p10 = _mm256_mullo_epi16(L9, coeff7); p20 = _mm256_mullo_epi16(L10, coeff5); p30 = _mm256_add_epi16(L11, coeff8); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 4); p01 = _mm256_mullo_epi16(H8, coeff3); p11 = _mm256_mullo_epi16(H9, coeff7); p21 = _mm256_mullo_epi16(H10, coeff5); p31 = 
_mm256_add_epi16(H11, coeff8); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, p21); p01 = _mm256_add_epi16(p01, p31); p01 = _mm256_srli_epi16(p01, 4); p00 = _mm256_packus_epi16(p00, p01); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_storeu_si256((__m256i*)&pfirst[5][i], p00); p00 = _mm256_mullo_epi16(L9, coeff3); p10 = _mm256_mullo_epi16(L10, coeff11); p20 = _mm256_mullo_epi16(L11, coeff13); p30 = _mm256_mullo_epi16(L12, coeff5); p00 = _mm256_add_epi16(p00, coeff16); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 5); p01 = _mm256_mullo_epi16(H9, coeff3); p11 = _mm256_mullo_epi16(H10, coeff11); p21 = _mm256_mullo_epi16(H11, coeff13); p31 = _mm256_mullo_epi16(H12, coeff5); p01 = _mm256_add_epi16(p01, coeff16); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, p21); p01 = _mm256_add_epi16(p01, p31); p01 = _mm256_srli_epi16(p01, 5); p00 = _mm256_packus_epi16(p00, p01); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_storeu_si256((__m256i*)&pfirst[6][i], p00); p00 = _mm256_add_epi16(L11, L13); p10 = _mm256_add_epi16(L12, L12); p00 = _mm256_add_epi16(p00, coeff2); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_srli_epi16(p00, 2); p01 = _mm256_add_epi16(H11, H13); p11 = _mm256_add_epi16(H12, H12); p01 = _mm256_add_epi16(p01, coeff2); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_srli_epi16(p01, 2); p00 = _mm256_packus_epi16(p00, p01); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_storeu_si256((__m256i*)&pfirst[7][i], p00); } #if BUGFIX_PREDICTION_INTRA if (i < line_size) { #else if (i < real_size) { #endif SS1 = _mm256_loadu_si256((__m256i*)(src + 1));//1...8 9...16 17..24 25..32 L1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 0));//1 SS1 = _mm256_loadu_si256((__m256i*)(src + 2)); L2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 0));//2 SS1 = _mm256_loadu_si256((__m256i*)(src + 3)); L3 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 0));//3 SS1 = _mm256_loadu_si256((__m256i*)(src + 4)); L4 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 0));//4 SS1 = _mm256_loadu_si256((__m256i*)(src + 5)); L5 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 0)); SS1 = _mm256_loadu_si256((__m256i*)(src + 6)); L6 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 0)); SS1 = _mm256_loadu_si256((__m256i*)(src + 7)); L7 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 0)); SS1 = _mm256_loadu_si256((__m256i*)(src + 8)); L8 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 0)); SS1 = _mm256_loadu_si256((__m256i*)(src + 9)); L9 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 0)); SS1 = _mm256_loadu_si256((__m256i*)(src + 10)); L10 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 0)); SS1 = _mm256_loadu_si256((__m256i*)(src + 11)); L11 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 0)); SS1 = _mm256_loadu_si256((__m256i*)(src + 12)); L12 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 0)); SS1 = _mm256_loadu_si256((__m256i*)(src + 13)); L13 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 0)); p00 = _mm256_mullo_epi16(L1, coeff5); p10 = _mm256_mullo_epi16(L2, coeff13); p20 = _mm256_mullo_epi16(L3, coeff11); p30 = _mm256_mullo_epi16(L4, coeff3); p00 = _mm256_add_epi16(p00, coeff16); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 5); p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 0x0008); __m256i mask = 
_mm256_loadu_si256((__m256i*)intrinsic_mask_256_8bit[15]); _mm256_maskstore_epi64((__int64 *)&pfirst[0][i], mask, p00); p10 = _mm256_mullo_epi16(L3, coeff5); p20 = _mm256_mullo_epi16(L4, coeff7); p30 = _mm256_mullo_epi16(L5, coeff3); p00 = _mm256_add_epi16(L2, coeff8); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 4); p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 0x0008); _mm256_maskstore_epi64((__int64 *)&pfirst[1][i], mask, p00); p00 = _mm256_mullo_epi16(L4, coeff7); p10 = _mm256_mullo_epi16(L5, coeff15); p20 = _mm256_mullo_epi16(L6, coeff9); p30 = _mm256_add_epi16(L7, coeff16); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 5); p00 = _mm256_packus_epi16(p00, p00); _mm256_maskstore_epi64((__int64 *)&pfirst[2][i], mask, p00); p00 = _mm256_add_epi16(L5, L8); p10 = _mm256_add_epi16(L6, L7); p10 = _mm256_mullo_epi16(p10, coeff3); p00 = _mm256_add_epi16(p00, coeff4); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_srli_epi16(p00, 3); p00 = _mm256_packus_epi16(p00, p00); _mm256_maskstore_epi64((__int64 *)&pfirst[3][i], mask, p00); p00 = _mm256_add_epi16(L6, coeff16); p10 = _mm256_mullo_epi16(L7, coeff9); p20 = _mm256_mullo_epi16(L8, coeff15); p30 = _mm256_mullo_epi16(L9, coeff7); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 5); p00 = _mm256_packus_epi16(p00, p00); _mm256_maskstore_epi64((__int64 *)&pfirst[4][i], mask, p00); p00 = _mm256_mullo_epi16(L8, coeff3); p10 = _mm256_mullo_epi16(L9, coeff7); p20 = _mm256_mullo_epi16(L10, coeff5); p30 = _mm256_add_epi16(L11, coeff8); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 4); p00 = _mm256_packus_epi16(p00, p00); _mm256_maskstore_epi64((__int64 *)&pfirst[5][i], mask, p00); p00 = _mm256_mullo_epi16(L9, coeff3); p10 = _mm256_mullo_epi16(L10, coeff11); p20 = _mm256_mullo_epi16(L11, coeff13); p30 = _mm256_mullo_epi16(L12, coeff5); p00 = _mm256_add_epi16(p00, coeff16); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 5); p00 = _mm256_packus_epi16(p00, p00); _mm256_maskstore_epi64((__int64 *)&pfirst[6][i], mask, p00); p00 = _mm256_add_epi16(L11, L13); p10 = _mm256_add_epi16(L12, L12); p00 = _mm256_add_epi16(p00, coeff2); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_srli_epi16(p00, 2); p00 = _mm256_packus_epi16(p00, p00); _mm256_maskstore_epi64((__int64 *)&pfirst[7][i], mask, p00); } bsy >>= 3; __m256i M; if (bsx == 64){ for (i = 0; i < bsy; i++){ M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i * 11)); _mm256_storeu_si256((__m256i*)dst1, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i * 11 + 32)); _mm256_storeu_si256((__m256i*)(dst1 + 32), M); M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i * 11)); _mm256_storeu_si256((__m256i*)dst2, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i * 11 + 32)); _mm256_storeu_si256((__m256i*)(dst2 + 32), M); M = _mm256_lddqu_si256((__m256i*)(pfirst[2] + i * 11)); _mm256_storeu_si256((__m256i*)dst3, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[2] + i * 11 + 32)); _mm256_storeu_si256((__m256i*)(dst3 + 32), M); M = _mm256_lddqu_si256((__m256i*)(pfirst[3] + i * 11)); _mm256_storeu_si256((__m256i*)dst4, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[3] + i * 11 + 32)); 
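/* Note (inference from the indexing above, kept as a hedged comment):
 * mode x_5 precomputes eight phase lines pfirst[0..7], one per row of an
 * 8-row group; each group advances by 11 integer samples, which is why
 * row r of group i loads from pfirst[r] + i * 11 (and + 32 for the second
 * half of a 64-wide row, as in the load/store pair around this note). */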
_mm256_storeu_si256((__m256i*)(dst4 + 32), M); M = _mm256_lddqu_si256((__m256i*)(pfirst[4] + i * 11)); _mm256_storeu_si256((__m256i*)dst5, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[4] + i * 11 + 32)); _mm256_storeu_si256((__m256i*)(dst5 + 32), M); M = _mm256_lddqu_si256((__m256i*)(pfirst[5] + i * 11)); _mm256_storeu_si256((__m256i*)dst6, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[5] + i * 11 + 32)); _mm256_storeu_si256((__m256i*)(dst6 + 32), M); M = _mm256_lddqu_si256((__m256i*)(pfirst[6] + i * 11)); _mm256_storeu_si256((__m256i*)dst7, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[6] + i * 11 + 32)); _mm256_storeu_si256((__m256i*)(dst7 + 32), M); M = _mm256_lddqu_si256((__m256i*)(pfirst[7] + i * 11)); _mm256_storeu_si256((__m256i*)dst8, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[7] + i * 11 + 32)); _mm256_storeu_si256((__m256i*)(dst8 + 32), M); dst1 = dst8 + i_dst; dst2 = dst1 + i_dst; dst3 = dst2 + i_dst; dst4 = dst3 + i_dst; dst5 = dst4 + i_dst; dst6 = dst5 + i_dst; dst7 = dst6 + i_dst; dst8 = dst7 + i_dst; } } else if (bsx == 32) { for (i = 0; i < bsy; i++){ M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i * 11)); _mm256_storeu_si256((__m256i*)dst1, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i * 11)); _mm256_storeu_si256((__m256i*)dst2, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[2] + i * 11)); _mm256_storeu_si256((__m256i*)dst3, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[3] + i * 11)); _mm256_storeu_si256((__m256i*)dst4, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[4] + i * 11)); _mm256_storeu_si256((__m256i*)dst5, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[5] + i * 11)); _mm256_storeu_si256((__m256i*)dst6, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[6] + i * 11)); _mm256_storeu_si256((__m256i*)dst7, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[7] + i * 11)); _mm256_storeu_si256((__m256i*)dst8, M); dst1 = dst8 + i_dst; dst2 = dst1 + i_dst; dst3 = dst2 + i_dst; dst4 = dst3 + i_dst; dst5 = dst4 + i_dst; dst6 = dst5 + i_dst; dst7 = dst6 + i_dst; dst8 = dst7 + i_dst; } } else { __m256i mask = _mm256_lddqu_si256((__m256i*)intrinsic_mask_256_8bit[15]); for (i = 0; i < bsy; i++){ M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i * 11)); _mm256_maskstore_epi64((__int64 *)dst1, mask, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i * 11)); _mm256_maskstore_epi64((__int64 *)dst2, mask, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[2] + i * 11)); _mm256_maskstore_epi64((__int64 *)dst3, mask, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[3] + i * 11)); _mm256_maskstore_epi64((__int64 *)dst4, mask, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[4] + i * 11)); _mm256_maskstore_epi64((__int64 *)dst5, mask, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[5] + i * 11)); _mm256_maskstore_epi64((__int64 *)dst6, mask, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[6] + i * 11)); _mm256_maskstore_epi64((__int64 *)dst7, mask, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[7] + i * 11)); _mm256_maskstore_epi64((__int64 *)dst8, mask, M); dst1 = dst8 + i_dst; dst2 = dst1 + i_dst; dst3 = dst2 + i_dst; dst4 = dst3 + i_dst; dst5 = dst4 + i_dst; dst6 = dst5 + i_dst; dst7 = dst6 + i_dst; dst8 = dst7 + i_dst; } } /*for (i = 0; i < bsy; i++) { memcpy(dst1, pfirst[0] + i * 11, bsx * sizeof(pel_t)); memcpy(dst2, pfirst[1] + i * 11, bsx * sizeof(pel_t)); memcpy(dst3, pfirst[2] + i * 11, bsx * sizeof(pel_t)); memcpy(dst4, pfirst[3] + i * 11, bsx * sizeof(pel_t)); memcpy(dst5, pfirst[4] + i * 11, bsx * sizeof(pel_t)); memcpy(dst6, pfirst[5] + i * 11, bsx * sizeof(pel_t)); memcpy(dst7, pfirst[6] + i * 11, bsx * 
sizeof(pel_t)); memcpy(dst8, pfirst[7] + i * 11, bsx * sizeof(pel_t)); dst1 = dst8 + i_dst; dst2 = dst1 + i_dst; dst3 = dst2 + i_dst; dst4 = dst3 + i_dst; dst5 = dst4 + i_dst; dst6 = dst5 + i_dst; dst7 = dst6 + i_dst; dst8 = dst7 + i_dst; }*/ } else if (bsx == 16) { pel_t *dst1 = dst; pel_t *dst2 = dst1 + i_dst; pel_t *dst3 = dst2 + i_dst; pel_t *dst4 = dst3 + i_dst; __m256i p00, p10, p20, p30; __m256i SS1; __m256i L1, L2, L3, L4, L5, L6, L7, L8; SS1 = _mm256_loadu_si256((__m256i*)(src + 1));//1...8 9...16 17..24 25..32 L1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 0));//1 SS1 = _mm256_loadu_si256((__m256i*)(src + 2)); L2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 0));//2 SS1 = _mm256_loadu_si256((__m256i*)(src + 3)); L3 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 0));//3 SS1 = _mm256_loadu_si256((__m256i*)(src + 4)); L4 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 0));//4 SS1 = _mm256_loadu_si256((__m256i*)(src + 5)); L5 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 0)); SS1 = _mm256_loadu_si256((__m256i*)(src + 6)); L6 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 0)); SS1 = _mm256_loadu_si256((__m256i*)(src + 7)); L7 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 0)); SS1 = _mm256_loadu_si256((__m256i*)(src + 8)); L8 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(SS1, 0)); p00 = _mm256_mullo_epi16(L1, coeff5); p10 = _mm256_mullo_epi16(L2, coeff13); p20 = _mm256_mullo_epi16(L3, coeff11); p30 = _mm256_mullo_epi16(L4, coeff3); p00 = _mm256_add_epi16(p00, coeff16); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 5); p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 0x0008); __m256i mask = _mm256_loadu_si256((__m256i*)intrinsic_mask_256_8bit[15]); _mm256_maskstore_epi64((__int64 *)dst1, mask, p00); p10 = _mm256_mullo_epi16(L3, coeff5); p20 = _mm256_mullo_epi16(L4, coeff7); p30 = _mm256_mullo_epi16(L5, coeff3); p00 = _mm256_add_epi16(L2, coeff8); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 4); p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 0x0008); _mm256_maskstore_epi64((__int64 *)dst2, mask, p00); p00 = _mm256_mullo_epi16(L4, coeff7); p10 = _mm256_mullo_epi16(L5, coeff15); p20 = _mm256_mullo_epi16(L6, coeff9); p30 = _mm256_add_epi16(L7, coeff16); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 5); p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 0x0008); _mm256_maskstore_epi64((__int64 *)dst3, mask, p00); p00 = _mm256_add_epi16(L5, L8); p10 = _mm256_add_epi16(L6, L7); p10 = _mm256_mullo_epi16(p10, coeff3); p00 = _mm256_add_epi16(p00, coeff4); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_srli_epi16(p00, 3); p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 0x0008); _mm256_maskstore_epi64((__int64 *)dst4, mask, p00); } else { //8x8 8x32 4x4 4x16 intra_pred_ang_x_5_sse128(src, dst, i_dst, dir_mode, bsx, bsy); } } void intra_pred_ang_x_6_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { ALIGN32(pel_t first_line[64 + 64]); int line_size = bsx + bsy - 1; int i; __m256i zero = _mm256_setzero_si256(); __m256i offset = _mm256_set1_epi16(2); UNUSED_PARAMETER(dir_mode); src += 2; #if BUGFIX_PREDICTION_INTRA for (i = 0; i < line_size - 16; i += 32, src += 32) { #else for (i = 0; 
i < real_size - 16; i += 32, src += 32) { #endif //0 1 2 3 .... 12 13 14 15 16 17 18 19 .... 28 29 30 31 __m256i S0 = _mm256_loadu_si256((__m256i*)(src - 1)); __m256i S1 = _mm256_loadu_si256((__m256i*)(src)); __m256i S2 = _mm256_loadu_si256((__m256i*)(src + 1)); __m256i L0 = _mm256_unpacklo_epi8(S0, zero);//0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23 __m256i L1 = _mm256_unpacklo_epi8(S1, zero); __m256i L2 = _mm256_unpacklo_epi8(S2, zero); __m256i H0 = _mm256_unpackhi_epi8(S0, zero);//8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31 __m256i H1 = _mm256_unpackhi_epi8(S1, zero); __m256i H2 = _mm256_unpackhi_epi8(S2, zero); __m256i tmp0 = _mm256_permute2x128_si256(L0, H0, 0x0020);//0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 __m256i tmp1 = _mm256_permute2x128_si256(L1, H1, 0x0020); __m256i tmp2 = _mm256_permute2x128_si256(L2, H2, 0x0020); __m256i sum1 = _mm256_add_epi16(tmp0, tmp1); __m256i sum2 = _mm256_add_epi16(tmp1, tmp2); tmp0 = _mm256_permute2x128_si256(L0, H0, 0x0031);//16 17...24 25... tmp1 = _mm256_permute2x128_si256(L1, H1, 0x0031); tmp2 = _mm256_permute2x128_si256(L2, H2, 0x0031); __m256i sum3 = _mm256_add_epi16(tmp0, tmp1); __m256i sum4 = _mm256_add_epi16(tmp1, tmp2); sum1 = _mm256_add_epi16(sum1, sum2); sum3 = _mm256_add_epi16(sum3, sum4); sum1 = _mm256_add_epi16(sum1, offset); sum3 = _mm256_add_epi16(sum3, offset); sum1 = _mm256_srli_epi16(sum1, 2); sum3 = _mm256_srli_epi16(sum3, 2); sum1 = _mm256_packus_epi16(sum1, sum3);//0 2 1 3 sum1 = _mm256_permute4x64_epi64(sum1, 0x00D8); _mm256_storeu_si256((__m256i*)&first_line[i], sum1); } #if BUGFIX_PREDICTION_INTRA if (i < line_size) { #else if (i < real_size) { #endif //0 1 2 3 .... 12 13 14 15 16 17 18 19 .... 28 29 30 31 __m256i S0 = _mm256_loadu_si256((__m256i*)(src - 1)); __m256i S2 = _mm256_loadu_si256((__m256i*)(src + 1)); __m256i S1 = _mm256_loadu_si256((__m256i*)(src)); S0 = _mm256_permute4x64_epi64(S0, 0x00D8); S2 = _mm256_permute4x64_epi64(S2, 0x00D8); S1 = _mm256_permute4x64_epi64(S1, 0x00D8); __m256i L0 = _mm256_unpacklo_epi8(S0, zero); __m256i L1 = _mm256_unpacklo_epi8(S1, zero); __m256i L2 = _mm256_unpacklo_epi8(S2, zero); __m256i sum1 = _mm256_add_epi16(L0, L1); __m256i sum2 = _mm256_add_epi16(L1, L2); sum1 = _mm256_add_epi16(sum1, sum2); sum1 = _mm256_add_epi16(sum1, offset); sum1 = _mm256_srli_epi16(sum1, 2); sum1 = _mm256_packus_epi16(sum1, sum1); sum1 = _mm256_permute4x64_epi64(sum1, 0x0008); //store 128 bit __m256i mask2 = _mm256_loadu_si256((__m256i*)intrinsic_mask_256_8bit[15]); _mm256_maskstore_epi64((__int64 *)(first_line + i), mask2, sum1); //_mm_storel_epi64((__m128i*)&first_line[i], sum1); } #if !BUGFIX_PREDICTION_INTRA // padding for (i = real_size; i < line_size; i += 32) { __m256i pad = _mm256_set1_epi8(first_line[real_size - 1]); _mm256_storeu_si256((__m256i*)&first_line[i], pad); } #endif if (bsx == 64){ for (i = 0; i < bsy; i += 4){ __m256i M = _mm256_lddqu_si256((__m256i*)&first_line[i]); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(&first_line[i] + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)&first_line[i + 1]); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(&first_line[i + 1] + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)&first_line[i + 2]); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(&first_line[i + 2] + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)&first_line[i + 3]); 
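/* Note (illustrative comment): mode x_6 is the pure diagonal with one
 * sample of displacement per row, so output row j is simply a copy,
 *     dst[j * i_dst + x] = first_line[j + x];
 * where first_line[] again holds the (a + 2*b + c + 2) >> 2 smoothed
 * reference computed above. */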
_mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(&first_line[i + 3] + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; } } else if (bsx == 32){ for (i = 0; i < bsy; i += 4) { __m256i M = _mm256_lddqu_si256((__m256i*)&first_line[i]); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)&first_line[i + 1]); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)&first_line[i + 2]); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)&first_line[i + 3]); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; } } else if (bsx == 16){ __m256i mask = _mm256_lddqu_si256((__m256i*)intrinsic_mask_256_8bit[15]); for (i = 0; i < bsy; i += 4){ __m256i M = _mm256_lddqu_si256((__m256i*)&first_line[i]); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)&first_line[i + 1]); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)&first_line[i + 2]); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)&first_line[i + 3]); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; } } else if (bsx == 8){ __m256i mask = _mm256_lddqu_si256((__m256i*)intrinsic_mask_256_8bit[7]); for (i = 0; i < bsy; i += 4){ __m256i M = _mm256_lddqu_si256((__m256i*)&first_line[i]); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)&first_line[i + 1]); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)&first_line[i + 2]); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)&first_line[i + 3]); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; } } else { __m256i mask = _mm256_lddqu_si256((__m256i*)intrinsic_mask_256_8bit[3]); for (i = 0; i < bsy; i += 4){ __m256i M = _mm256_lddqu_si256((__m256i*)&first_line[i]); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)&first_line[i + 1]); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)&first_line[i + 2]); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)&first_line[i + 3]); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; } } /* if (bsx == bsy || bsx >= 16) { for (i = 0; i < bsy; i++) { memcpy(dst, first_line + i, bsx * sizeof(pel_t)); dst += i_dst; } } else {//8x32 4x16 __m256i mask = _mm256_loadu_si256((const __m256i*)intrinsic_mask_256_8bit[bsx - 1]); for (i = 0; i < bsy; i += 4) { __m256i M = _mm256_loadu_si256((__m256i*)&first_line[i]); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_srli_si256(M, 1); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_srli_si256(M, 1); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_srli_si256(M, 1); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; } }*/ } void intra_pred_ang_x_7_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i, j; UNUSED_PARAMETER(dir_mode); if (bsx >= bsy) { if (bsx <= 8) {//4x4 8x8 intra_pred_ang_x_7_sse128(src, dst, i_dst, dir_mode, bsx, bsy); } else if (bsx & 16){//16 __m256i S0, S1, S2, S3; __m256i t0, t1, t2, t3; __m256i c0; __m256i D0; __m256i off = _mm256_set1_epi16(64); __m256i mask = _mm256_loadu_si256((__m256i*)intrinsic_mask_256_8bit[15]); for (j = 0; j < bsy; j++) { int idx 
= tab_idx_mode_7[j]; c0 = _mm256_loadu_si256((__m256i*)tab_coeff_mode_7_avx[j]); S0 = _mm256_loadu_si256((__m256i*)(src + idx)); //0...7 8...15 16...23 24...31 S1 = _mm256_loadu_si256((__m256i*)(src + idx + 1));//1.. 8 9...16 17...24 25...32 S2 = _mm256_loadu_si256((__m256i*)(src + idx + 2));//2...9 10...17 S3 = _mm256_loadu_si256((__m256i*)(src + idx + 3));//3...10 11...18 S0 = _mm256_permute4x64_epi64(S0, 0x00D8);//0...7 16...23 8...15 24...31 S1 = _mm256_permute4x64_epi64(S1, 0x00D8);//1...8 17...24 9...16 25...32 S2 = _mm256_permute4x64_epi64(S2, 0x00D8); S3 = _mm256_permute4x64_epi64(S3, 0x00D8); t0 = _mm256_unpacklo_epi8(S0, S1);//0 1 1 2 2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 10 10 11 11 12 12 13 13 14 14 15 15 16 t1 = _mm256_unpacklo_epi8(S2, S3);//2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 10 10 11 11 12 12 13 13 14 14 15 15 16 16 17 17 18 t0 = _mm256_permute4x64_epi64(t0, 0x00D8); t1 = _mm256_permute4x64_epi64(t1, 0x00D8); t2 = _mm256_unpacklo_epi16(t0, t1);//0...7 t3 = _mm256_unpackhi_epi16(t0, t1);//8...15 t0 = _mm256_maddubs_epi16(t2, c0); t1 = _mm256_maddubs_epi16(t3, c0); D0 = _mm256_hadds_epi16(t0, t1);//0 1 2 3 8 9 10 11 4 5 6 7 12 13 14 15 D0 = _mm256_permute4x64_epi64(D0, 0x00D8); D0 = _mm256_add_epi16(D0, off); D0 = _mm256_srli_epi16(D0, 7); D0 = _mm256_packus_epi16(D0, D0); D0 = _mm256_permute4x64_epi64(D0, 0x00D8); _mm256_maskstore_epi64((__int64 *)dst, mask, D0); dst += i_dst; } } else {//32 64 __m256i S0, S1, S2, S3; __m256i t0, t1, t2, t3; __m256i c0; __m256i D0, D1; __m256i off = _mm256_set1_epi16(64); for (j = 0; j < bsy; j++) { int idx = tab_idx_mode_7[j]; c0 = _mm256_loadu_si256((__m256i*)tab_coeff_mode_7_avx[j]); for (i = 0; i < bsx; i += 32, idx += 32) { S0 = _mm256_loadu_si256((__m256i*)(src + idx)); //0...7 8...15 16...23 24...31 S1 = _mm256_loadu_si256((__m256i*)(src + idx + 1));//1.. 8 9...16 17...24 25...32 S2 = _mm256_loadu_si256((__m256i*)(src + idx + 2));//2...9 10...17 18 S3 = _mm256_loadu_si256((__m256i*)(src + idx + 3));//3...10 11...18 19 S0 = _mm256_permute4x64_epi64(S0, 0x00D8);//0...7 16...23 8...15 24...31 S1 = _mm256_permute4x64_epi64(S1, 0x00D8);//1...8 17...24 9...16 25...32 S2 = _mm256_permute4x64_epi64(S2, 0x00D8); S3 = _mm256_permute4x64_epi64(S3, 0x00D8); t0 = _mm256_unpacklo_epi8(S0, S1);//0 1 1 2 2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 10 10 11 11 12 12 13 13 14 14 15 15 16 t1 = _mm256_unpacklo_epi8(S2, S3);//2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 10 10 11 11 12 12 13 13 14 14 15 15 16 16 17 17 18 t0 = _mm256_permute4x64_epi64(t0, 0x00D8); t1 = _mm256_permute4x64_epi64(t1, 0x00D8); t2 = _mm256_unpacklo_epi16(t0, t1);// t3 = _mm256_unpackhi_epi16(t0, t1);//........15 16 17 18 t0 = _mm256_maddubs_epi16(t2, c0); t1 = _mm256_maddubs_epi16(t3, c0); D0 = _mm256_hadds_epi16(t0, t1);//0 1 2 3 8 9 10 11 4 5 6 7 12 13 14 15 D0 = _mm256_permute4x64_epi64(D0, 0x00D8); D0 = _mm256_add_epi16(D0, off); D0 = _mm256_srli_epi16(D0, 7); t0 = _mm256_unpackhi_epi8(S0, S1);//16 17 17 18 18 19 19 20 20 21 21 22 22 23 23 24...24 25 25.. t1 = _mm256_unpackhi_epi8(S2, S3);//18 19 19 20 ..... t0 = _mm256_permute4x64_epi64(t0, 0x00D8); t1 = _mm256_permute4x64_epi64(t1, 0x00D8); t2 = _mm256_unpacklo_epi16(t0, t1);//16 17 18 19... t3 = _mm256_unpackhi_epi16(t0, t1);//24 25 26 27... 
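/* Note (illustrative comment): t2/t3 now hold groups of four consecutive
 * bytes s[x..x+3], so the maddubs/hadds pair below evaluates the 4-tap
 * subpixel filter; with f[] = tab_coeff_mode_7_avx[j] (128-scaled weights)
 * the scalar form of each output sample is
 *     (f[0]*s[x] + f[1]*s[x+1] + f[2]*s[x+2] + f[3]*s[x+3] + 64) >> 7;
 * _mm256_maddubs_epi16 forms the pairwise products f0*s0 + f1*s1 and
 * f2*s2 + f3*s3, and _mm256_hadds_epi16 folds them into the full sum. */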
t0 = _mm256_maddubs_epi16(t2, c0); t1 = _mm256_maddubs_epi16(t3, c0); D1 = _mm256_hadds_epi16(t0, t1);//16 17 18 19 24 25 26 27 20 21 22 23 28 29 30 31 D1 = _mm256_permute4x64_epi64(D1, 0x00D8); D1 = _mm256_add_epi16(D1, off); D1 = _mm256_srli_epi16(D1, 7); D0 = _mm256_packus_epi16(D0, D1); D0 = _mm256_permute4x64_epi64(D0, 0x00D8); _mm256_storeu_si256((__m256i*)(dst + i), D0); } dst += i_dst; } } } else { intra_pred_ang_x_7_sse128(src, dst, i_dst, dir_mode, bsx, bsy); } } void intra_pred_ang_x_8_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { ALIGN32(pel_t first_line[2 * (64 + 48)]); int line_size = bsx + (bsy >> 1) - 1; #if !BUGFIX_PREDICTION_INTRA int real_size = DAVS2_MIN(line_size, (bsx << 1)); #endif int i; #if !BUGFIX_PREDICTION_INTRA __m128i pad1, pad2; #endif int aligned_line_size = ((line_size + 31) >> 4) << 4; pel_t *pfirst[2]; __m256i zero = _mm256_setzero_si256(); __m256i coeff = _mm256_set1_epi16(3); //16 __m256i offset1 = _mm256_set1_epi16(4); __m256i offset2 = _mm256_set1_epi16(2); UNUSED_PARAMETER(dir_mode); pfirst[0] = first_line; pfirst[1] = first_line + aligned_line_size; __m256i p01, p02, p11, p12; __m256i p21, p22, p31, p32; __m256i tmp0, tmp1, tmp2, tmp3; #if BUGFIX_PREDICTION_INTRA for (i = 0; i < line_size - 16; i += 32, src += 32) { #else for (i = 0; i < real_size - 16; i += 32, src += 32) { #endif //0 1 2 3 .... 12 13 14 15 16 17 18 19 .... 28 29 30 31 __m256i S0 = _mm256_loadu_si256((__m256i*)(src)); __m256i S3 = _mm256_loadu_si256((__m256i*)(src + 3)); __m256i S1 = _mm256_loadu_si256((__m256i*)(src + 1)); __m256i S2 = _mm256_loadu_si256((__m256i*)(src + 2)); __m256i L0 = _mm256_unpacklo_epi8(S0, zero);//0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23 __m256i L1 = _mm256_unpacklo_epi8(S1, zero); __m256i L2 = _mm256_unpacklo_epi8(S2, zero); __m256i L3 = _mm256_unpacklo_epi8(S3, zero); __m256i H0 = _mm256_unpackhi_epi8(S0, zero);//8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31 __m256i H1 = _mm256_unpackhi_epi8(S1, zero); __m256i H2 = _mm256_unpackhi_epi8(S2, zero); __m256i H3 = _mm256_unpackhi_epi8(S3, zero); tmp0 = _mm256_permute2x128_si256(L0, H0, 0x0020);//0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 tmp1 = _mm256_permute2x128_si256(L1, H1, 0x0020); tmp2 = _mm256_permute2x128_si256(L2, H2, 0x0020); tmp3 = _mm256_permute2x128_si256(L3, H3, 0x0020); p01 = _mm256_add_epi16(tmp1, tmp2); p01 = _mm256_mullo_epi16(p01, coeff); p02 = _mm256_add_epi16(tmp0, tmp3); p02 = _mm256_add_epi16(p02, offset1); p01 = _mm256_add_epi16(p01, p02); p01 = _mm256_srli_epi16(p01, 3); // prepare for next line p21 = _mm256_add_epi16(tmp1, tmp2); p22 = _mm256_add_epi16(tmp2, tmp3); tmp0 = _mm256_permute2x128_si256(L0, H0, 0x0031);//16 17....24 25.... 
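/* Note (illustrative comment): mode x_8 derives two interpolated lines per
 * pass from the four neighbours a..d (S0..S3 widened to 16 bit); in scalar
 * form the two rows computed here are
 *     pfirst[0][k] = (a + 3*b + 3*c + d + 4) >> 3;
 *     pfirst[1][k] = (b + 2*c + d + 2) >> 2;
 * the 0x0031 permutes merely switch to the upper 16 widened samples. */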
tmp1 = _mm256_permute2x128_si256(L1, H1, 0x0031); tmp2 = _mm256_permute2x128_si256(L2, H2, 0x0031); tmp3 = _mm256_permute2x128_si256(L3, H3, 0x0031); p11 = _mm256_add_epi16(tmp1, tmp2); p11 = _mm256_mullo_epi16(p11, coeff); p12 = _mm256_add_epi16(tmp0, tmp3); p12 = _mm256_add_epi16(p12, offset1); p11 = _mm256_add_epi16(p11, p12); p11 = _mm256_srli_epi16(p11, 3); //prepare for next line p31 = _mm256_add_epi16(tmp1, tmp2); p32 = _mm256_add_epi16(tmp2, tmp3); p01 = _mm256_packus_epi16(p01, p11); p01 = _mm256_permute4x64_epi64(p01, 0x00D8); _mm256_storeu_si256((__m256i*)&pfirst[0][i], p01); p21 = _mm256_add_epi16(p21, p22); p31 = _mm256_add_epi16(p31, p32); p21 = _mm256_add_epi16(p21, offset2); p31 = _mm256_add_epi16(p31, offset2); p21 = _mm256_srli_epi16(p21, 2); p31 = _mm256_srli_epi16(p31, 2); p21 = _mm256_packus_epi16(p21, p31); p21 = _mm256_permute4x64_epi64(p21, 0x00D8); _mm256_storeu_si256((__m256i*)&pfirst[1][i], p21); } #if BUGFIX_PREDICTION_INTRA if (i < line_size) { #else if (i < real_size) { #endif __m256i S0 = _mm256_loadu_si256((__m256i*)(src)); __m256i S3 = _mm256_loadu_si256((__m256i*)(src + 3)); __m256i S1 = _mm256_loadu_si256((__m256i*)(src + 1)); __m256i S2 = _mm256_loadu_si256((__m256i*)(src + 2)); S0 = _mm256_permute4x64_epi64(S0, 0x00D8); S3 = _mm256_permute4x64_epi64(S3, 0x00D8); S1 = _mm256_permute4x64_epi64(S1, 0x00D8); S2 = _mm256_permute4x64_epi64(S2, 0x00D8); __m256i L0 = _mm256_unpacklo_epi8(S0, zero); __m256i L1 = _mm256_unpacklo_epi8(S1, zero); __m256i L2 = _mm256_unpacklo_epi8(S2, zero); __m256i L3 = _mm256_unpacklo_epi8(S3, zero); p01 = _mm256_add_epi16(L1, L2); p01 = _mm256_mullo_epi16(p01, coeff); p02 = _mm256_add_epi16(L0, L3); p02 = _mm256_add_epi16(p02, offset1); p01 = _mm256_add_epi16(p01, p02); p01 = _mm256_srli_epi16(p01, 3); p01 = _mm256_packus_epi16(p01, p01); p01 = _mm256_permute4x64_epi64(p01, 0x0008); __m256i mask = _mm256_loadu_si256((__m256i*)intrinsic_mask_256_8bit[15]); _mm256_maskstore_epi64((__int64 *)&pfirst[0][i], mask, p01); p01 = _mm256_add_epi16(L1, L2); p02 = _mm256_add_epi16(L2, L3); p01 = _mm256_add_epi16(p01, p02); p01 = _mm256_add_epi16(p01, offset2); p01 = _mm256_srli_epi16(p01, 2); p01 = _mm256_packus_epi16(p01, p01); p01=_mm256_permute4x64_epi64(p01,0x0008); _mm256_maskstore_epi64((__int64 *)&pfirst[1][i], mask, p01); } #if !BUGFIX_PREDICTION_INTRA // padding if (real_size < line_size) { pfirst[1][real_size - 1] = pfirst[1][real_size - 2]; pad1 = _mm256_set1_epi8(pfirst[0][real_size - 1]); pad2 = _mm256_set1_epi8(pfirst[1][real_size - 1]); for (i = real_size; i < line_size; i += 32) { _mm256_storeu_si256((__m256i*)&pfirst[0][i], pad1); _mm256_storeu_si256((__m256i*)&pfirst[1][i], pad2); } } #endif bsy >>= 1; if (bsx == 64){ for (i = 0; i < bsy; i += 4) { __m256i M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 1)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 1 + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 1)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 1 + 32)); 
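/* Note (inference from the loop structure): with half a sample of
 * displacement per row, the two precomputed lines alternate and each
 * advances by one sample every two output rows, i.e. rows 2*k and 2*k + 1
 * read pfirst[0] + k and pfirst[1] + k respectively. */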
_mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 2)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 2 + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 2)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 2 + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 3)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 3 + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 3)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 3 + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; } } else if (bsx == 32){ for (i = 0; i < bsy; i += 4){ __m256i M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 1)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 1)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 2)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 2)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 3)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 3)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; } } else if (bsx == 16){ __m256i mask = _mm256_loadu_si256((const __m256i*)intrinsic_mask_256_8bit[bsx - 1]); for (i = 0; i < bsy; i += 4) { __m256i M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 1)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 1)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 2)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 2)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 3)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 3)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; } } else if (bsx == 8){ __m256i mask = _mm256_loadu_si256((const __m256i*)intrinsic_mask_256_8bit[bsx - 1]); for (i = 0; i < bsy; i += 4) { __m256i M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 1)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 1)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = 
_mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 2)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 2)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 3)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 3)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; } } else{ __m256i mask = _mm256_loadu_si256((const __m256i*)intrinsic_mask_256_8bit[bsx - 1]); for (i = 0; i < bsy; i += 4) { __m256i M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 1)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 1)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 2)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 2)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 3)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 3)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; } } /*if (bsx != 8) { for (i = 0; i < bsy; i++) { memcpy(dst, pfirst[0] + i, bsx * sizeof(pel_t)); memcpy(dst + i_dst, pfirst[1] + i, bsx * sizeof(pel_t)); dst += i_dst2; } } else if (bsy == 4) {//8x8 __m256i mask = _mm256_loadu_si256((const __m256i*)intrinsic_mask_256_8bit[bsx - 1]); __m256i M1 = _mm256_loadu_si256((__m256i*)&pfirst[0][0]); __m256i M2 = _mm256_loadu_si256((__m256i*)&pfirst[1][0]); _mm256_maskstore_epi64((__int64 *)dst, mask, M1); _mm256_maskstore_epi64((__int64 *)(dst + i_dst), mask, M2); dst += i_dst2; M1 = _mm256_srli_si256(M1, 1); M2 = _mm256_srli_si256(M2, 1); _mm256_maskstore_epi64((__int64 *)dst, mask, M1); _mm256_maskstore_epi64((__int64 *)(dst + i_dst), mask, M2); dst += i_dst2; M1 = _mm256_srli_si256(M1, 1); M2 = _mm256_srli_si256(M2, 1); _mm256_maskstore_epi64((__int64 *)dst, mask, M1); _mm256_maskstore_epi64((__int64 *)(dst + i_dst), mask, M2); dst += i_dst2; M1 = _mm256_srli_si256(M1, 1); M2 = _mm256_srli_si256(M2, 1); _mm256_maskstore_epi64((__int64 *)dst, mask, M1); _mm256_maskstore_epi64((__int64 *)(dst + i_dst), mask, M2); } else { //8x32 __m256i mask = _mm256_loadu_si256((const __m256i*)intrinsic_mask_256_8bit[bsx - 1]); for (i = 0; i < 16; i = i + 4) { __m256i M1 = _mm256_loadu_si256((__m256i*)&pfirst[0][i]); __m256i M2 = _mm256_loadu_si256((__m256i*)&pfirst[1][i]); _mm256_maskstore_epi64((__int64 *)dst, mask, M1); _mm256_maskstore_epi64((__int64 *)(dst + i_dst), mask, M2); dst += i_dst2; M1 = _mm256_srli_si256(M1, 1); M2 = _mm256_srli_si256(M2, 1); _mm256_maskstore_epi64((__int64 *)dst, mask, M1); _mm256_maskstore_epi64((__int64 *)(dst + i_dst), mask, M2); dst += i_dst2; M1 = _mm256_srli_si256(M1, 1); M2 = _mm256_srli_si256(M2, 1); _mm256_maskstore_epi64((__int64 *)dst, mask, M1); _mm256_maskstore_epi64((__int64 *)(dst + i_dst), mask, M2); dst += i_dst2; M1 = _mm256_srli_si256(M1, 1); M2 = _mm256_srli_si256(M2, 1); _mm256_maskstore_epi64((__int64 *)dst, mask, M1); _mm256_maskstore_epi64((__int64 *)(dst + i_dst), mask, M2); dst += i_dst2; //M1 = _mm256_srli_si256(M1, 1); //M2 = 
_mm256_srli_si256(M2, 1); //_mm256_maskstore_epi64((__m256i*)dst, mask, M1); //_mm256_maskstore_epi64((__m256i*)(dst + i_dst), mask, M2); //dst += i_dst2; //M1 = _mm256_srli_si256(M1, 1); //M2 = _mm256_srli_si256(M2, 1); //_mm256_maskstore_epi64((__m256i*)dst, mask, M1); //_mm256_maskstore_epi64((__m256i*)(dst + i_dst), mask, M2); //dst += i_dst2; //M1 = _mm256_srli_si256(M1, 1); //M2 = _mm256_srli_si256(M2, 1); //_mm256_maskstore_epi64((__m256i*)dst, mask, M1); //_mm256_maskstore_epi64((__m256i*)(dst + i_dst), mask, M2); //dst += i_dst2; //M1 = _mm256_srli_si256(M1, 1); //M2 = _mm256_srli_si256(M2, 1); //_mm256_maskstore_epi64((__m256i*)dst, mask, M1); //_mm256_maskstore_epi64((__m256i*)(dst + i_dst), mask, M2); //dst += i_dst2; } }*/ } void intra_pred_ang_x_9_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i, j; UNUSED_PARAMETER(dir_mode); if (bsx >= bsy) { if (bsx & 0x07) {//4 intra_pred_ang_x_9_sse128(src, dst, i_dst, dir_mode, bsx, bsy); } else if (bsx & 0x0f) {//8 intra_pred_ang_x_9_sse128(src, dst, i_dst, dir_mode, bsx, bsy); } else if (bsx & 16){//16 __m256i S0, S1, S2, S3; __m256i t0, t1, t2, t3; __m256i c0; __m256i D0; __m256i off = _mm256_set1_epi16(64); __m256i mask = _mm256_loadu_si256((__m256i*)intrinsic_mask_256_8bit[15]); for (j = 0; j < bsy; j++) { int idx = tab_idx_mode_9[j]; c0 = _mm256_set1_epi32(((int*)(tab_coeff_mode_9[j]))[0]); S0 = _mm256_loadu_si256((__m256i*)(src + idx)); //0...7 8...15 16...23 24...31 S1 = _mm256_loadu_si256((__m256i*)(src + idx + 1));//1.. 8 9...16 17...24 25...32 S2 = _mm256_loadu_si256((__m256i*)(src + idx + 2));//2...9 10...17 S3 = _mm256_loadu_si256((__m256i*)(src + idx + 3));//3...10 11...18 S0 = _mm256_permute4x64_epi64(S0, 0x00D8);//0...7 16...23 8...15 24...31 S1 = _mm256_permute4x64_epi64(S1, 0x00D8);//1...8 17...24 9...16 25...32 S2 = _mm256_permute4x64_epi64(S2, 0x00D8); S3 = _mm256_permute4x64_epi64(S3, 0x00D8); t0 = _mm256_unpacklo_epi8(S0, S1);//0 1 1 2 2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 10 10 11 11 12 12 13 13 14 14 15 15 16 t1 = _mm256_unpacklo_epi8(S2, S3);//2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 10 10 11 11 12 12 13 13 14 14 15 15 16 16 17 17 18 t0 = _mm256_permute4x64_epi64(t0, 0x00D8); t1 = _mm256_permute4x64_epi64(t1, 0x00D8); t2 = _mm256_unpacklo_epi16(t0, t1);//0...7 t3 = _mm256_unpackhi_epi16(t0, t1);//8...15 t0 = _mm256_maddubs_epi16(t2, c0); t1 = _mm256_maddubs_epi16(t3, c0); D0 = _mm256_hadds_epi16(t0, t1);//0 1 2 3 8 9 10 11 4 5 6 7 12 13 14 15 D0 = _mm256_permute4x64_epi64(D0, 0x00D8); D0 = _mm256_add_epi16(D0, off); D0 = _mm256_srli_epi16(D0, 7); D0 = _mm256_packus_epi16(D0, D0); D0 = _mm256_permute4x64_epi64(D0, 0x00D8); _mm256_maskstore_epi64((__int64 *)dst, mask, D0); dst += i_dst; } } else {//32 64 __m256i S0, S1, S2, S3; __m256i t0, t1, t2, t3; __m256i c0; __m256i D0, D1; __m256i off = _mm256_set1_epi16(64); for (j = 0; j < bsy; j++) { int idx = tab_idx_mode_9[j]; c0 = _mm256_set1_epi32(((int*)tab_coeff_mode_9[j])[0]); for (i = 0; i < bsx; i += 32, idx += 32) { S0 = _mm256_loadu_si256((__m256i*)(src + idx)); //0...7 8...15 16...23 24...31 S1 = _mm256_loadu_si256((__m256i*)(src + idx + 1));//1.. 
8 9...16 17...24 25...32 S2 = _mm256_loadu_si256((__m256i*)(src + idx + 2));//2...9 10...17 18 S3 = _mm256_loadu_si256((__m256i*)(src + idx + 3));//3...10 11...18 19 S0 = _mm256_permute4x64_epi64(S0, 0x00D8);//0...7 16...23 8...15 24...31 S1 = _mm256_permute4x64_epi64(S1, 0x00D8);//1...8 17...24 9...16 25...32 S2 = _mm256_permute4x64_epi64(S2, 0x00D8); S3 = _mm256_permute4x64_epi64(S3, 0x00D8); t0 = _mm256_unpacklo_epi8(S0, S1);//0 1 1 2 2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 10 10 11 11 12 12 13 13 14 14 15 15 16 t1 = _mm256_unpacklo_epi8(S2, S3);//2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 10 10 11 11 12 12 13 13 14 14 15 15 16 16 17 17 18 t0 = _mm256_permute4x64_epi64(t0, 0x00D8); t1 = _mm256_permute4x64_epi64(t1, 0x00D8); t2 = _mm256_unpacklo_epi16(t0, t1);// t3 = _mm256_unpackhi_epi16(t0, t1);//........15 16 17 18 t0 = _mm256_maddubs_epi16(t2, c0); t1 = _mm256_maddubs_epi16(t3, c0); D0 = _mm256_hadds_epi16(t0, t1);//0 1 2 3 8 9 10 11 4 5 6 7 12 13 14 15 D0 = _mm256_permute4x64_epi64(D0, 0x00D8); D0 = _mm256_add_epi16(D0, off); D0 = _mm256_srli_epi16(D0, 7); t0 = _mm256_unpackhi_epi8(S0, S1);//16 17 17 18 18 19 19 20 20 21 21 22 22 23 23 24...24 25 25.. t1 = _mm256_unpackhi_epi8(S2, S3);//18 19 19 20 ..... t0 = _mm256_permute4x64_epi64(t0, 0x00D8); t1 = _mm256_permute4x64_epi64(t1, 0x00D8); t2 = _mm256_unpacklo_epi16(t0, t1);//16 17 18 19... t3 = _mm256_unpackhi_epi16(t0, t1);//24 25 26 27... t0 = _mm256_maddubs_epi16(t2, c0); t1 = _mm256_maddubs_epi16(t3, c0); D1 = _mm256_hadds_epi16(t0, t1);//16 17 18 19 24 25 26 27 20 21 22 23 28 29 30 31 D1 = _mm256_permute4x64_epi64(D1, 0x00D8); D1 = _mm256_add_epi16(D1, off); D1 = _mm256_srli_epi16(D1, 7); D0 = _mm256_packus_epi16(D0, D1); D0 = _mm256_permute4x64_epi64(D0, 0x00D8); _mm256_storeu_si256((__m256i*)(dst + i), D0); } dst += i_dst; } } } else {//4x16 8x32 intra_pred_ang_x_9_sse128(src, dst, i_dst, dir_mode, bsx, bsy); } } void intra_pred_ang_x_10_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { if (bsy == 4){ intra_pred_ang_x_10_sse128(src, dst, i_dst, dir_mode, bsx, bsy); return; } int i; pel_t *dst1 = dst; pel_t *dst2 = dst1 + i_dst; pel_t *dst3 = dst2 + i_dst; pel_t *dst4 = dst3 + i_dst; UNUSED_PARAMETER(dir_mode); if (bsy != 4) { __m256i zero = _mm256_setzero_si256(); __m256i coeff2 = _mm256_set1_epi16(2); __m256i coeff3 = _mm256_set1_epi16(3); __m256i coeff4 = _mm256_set1_epi16(4); __m256i coeff5 = _mm256_set1_epi16(5); __m256i coeff7 = _mm256_set1_epi16(7); __m256i coeff8 = _mm256_set1_epi16(8); ALIGN32(pel_t first_line[4 * (64 + 32)]); int line_size = bsx + bsy / 4 - 1; int aligned_line_size = ((line_size + 31) >> 4) << 4; pel_t *pfirst[4]; pfirst[0] = first_line; pfirst[1] = first_line + aligned_line_size; pfirst[2] = first_line + aligned_line_size * 2; pfirst[3] = first_line + aligned_line_size * 3; for (i = 0; i < line_size - 16; i += 32, src += 32) { __m256i p00, p10, p20, p30; __m256i p01, p11, p21, p31; //0 1 2 3 .... 12 13 14 15 16 17 18 19 .... 
28 29 30 21 __m256i S0 = _mm256_loadu_si256((__m256i*)(src)); __m256i S3 = _mm256_loadu_si256((__m256i*)(src + 3)); __m256i S1 = _mm256_loadu_si256((__m256i*)(src + 1)); __m256i S2 = _mm256_loadu_si256((__m256i*)(src + 2)); __m256i L0 = _mm256_unpacklo_epi8(S0, zero);//0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23 __m256i L1 = _mm256_unpacklo_epi8(S1, zero); __m256i L2 = _mm256_unpacklo_epi8(S2, zero); __m256i L3 = _mm256_unpacklo_epi8(S3, zero); __m256i H0 = _mm256_unpackhi_epi8(S0, zero);// 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31 __m256i H1 = _mm256_unpackhi_epi8(S1, zero); __m256i H2 = _mm256_unpackhi_epi8(S2, zero); __m256i H3 = _mm256_unpackhi_epi8(S3, zero); __m256i tmpL0 = _mm256_permute2x128_si256(L0, H0, 0x0020);//0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 __m256i tmpL1 = _mm256_permute2x128_si256(L1, H1, 0x0020); __m256i tmpL2 = _mm256_permute2x128_si256(L2, H2, 0x0020); __m256i tmpL3 = _mm256_permute2x128_si256(L3, H3, 0x0020); __m256i tmpH0 = _mm256_permute2x128_si256(L0, H0, 0x0031);//16 17...24 25... __m256i tmpH1 = _mm256_permute2x128_si256(L1, H1, 0x0031); __m256i tmpH2 = _mm256_permute2x128_si256(L2, H2, 0x0031); __m256i tmpH3 = _mm256_permute2x128_si256(L3, H3, 0x0031); p00 = _mm256_mullo_epi16(tmpL0, coeff3);//0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 p10 = _mm256_mullo_epi16(tmpL1, coeff7); p20 = _mm256_mullo_epi16(tmpL2, coeff5); p30 = _mm256_add_epi16(tmpL3, coeff8); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_srli_epi16(p00, 4); p01 = _mm256_mullo_epi16(tmpH0, coeff3);//16 17...24 25... p11 = _mm256_mullo_epi16(tmpH1, coeff7); p21 = _mm256_mullo_epi16(tmpH2, coeff5); p31 = _mm256_add_epi16(tmpH3, coeff8); p01 = _mm256_add_epi16(p01, p31); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, p21); p01 = _mm256_srli_epi16(p01, 4); p00 = _mm256_packus_epi16(p00, p01); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_storeu_si256((__m256i*)&pfirst[0][i], p00); p00 = _mm256_add_epi16(tmpL1, tmpL2); p00 = _mm256_mullo_epi16(p00, coeff3); p10 = _mm256_add_epi16(tmpL0, tmpL3); p10 = _mm256_add_epi16(p10, coeff4); p00 = _mm256_add_epi16(p10, p00); p00 = _mm256_srli_epi16(p00, 3); p01 = _mm256_add_epi16(tmpH1, tmpH2); p01 = _mm256_mullo_epi16(p01, coeff3); p11 = _mm256_add_epi16(tmpH0, tmpH3); p11 = _mm256_add_epi16(p11, coeff4); p01 = _mm256_add_epi16(p11, p01); p01 = _mm256_srli_epi16(p01, 3); p00 = _mm256_packus_epi16(p00, p01); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_storeu_si256((__m256i*)&pfirst[1][i], p00); p10 = _mm256_mullo_epi16(tmpL1, coeff5); p20 = _mm256_mullo_epi16(tmpL2, coeff7); p30 = _mm256_mullo_epi16(tmpL3, coeff3); p00 = _mm256_add_epi16(tmpL0, coeff8); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 4); p11 = _mm256_mullo_epi16(tmpH1, coeff5); p21 = _mm256_mullo_epi16(tmpH2, coeff7); p31 = _mm256_mullo_epi16(tmpH3, coeff3); p01 = _mm256_add_epi16(tmpH0, coeff8); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, p21); p01 = _mm256_add_epi16(p01, p31); p01 = _mm256_srli_epi16(p01, 4); p00 = _mm256_packus_epi16(p00, p01); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_storeu_si256((__m256i*)&pfirst[2][i], p00); p00 = _mm256_add_epi16(tmpL1, tmpL2); p10 = _mm256_add_epi16(tmpL2, tmpL3); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, coeff2); p00 = _mm256_srli_epi16(p00, 2); p01 = _mm256_add_epi16(tmpH1, tmpH2); p11 = _mm256_add_epi16(tmpH2, tmpH3); p01 = 
_mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, coeff2); p01 = _mm256_srli_epi16(p01, 2); p00 = _mm256_packus_epi16(p00, p01); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_storeu_si256((__m256i*)&pfirst[3][i], p00); } if (i < line_size) { __m256i p00, p10, p20, p30; __m256i S0 = _mm256_loadu_si256((__m256i*)(src)); __m256i S3 = _mm256_loadu_si256((__m256i*)(src + 3)); __m256i S1 = _mm256_loadu_si256((__m256i*)(src + 1)); __m256i S2 = _mm256_loadu_si256((__m256i*)(src + 2)); S0 = _mm256_permute4x64_epi64(S0, 0x00D8); S3 = _mm256_permute4x64_epi64(S3, 0x00D8); S2 = _mm256_permute4x64_epi64(S2, 0x00D8); S1 = _mm256_permute4x64_epi64(S1, 0x00D8); __m256i L0 = _mm256_unpacklo_epi8(S0, zero); __m256i L1 = _mm256_unpacklo_epi8(S1, zero); __m256i L2 = _mm256_unpacklo_epi8(S2, zero); __m256i L3 = _mm256_unpacklo_epi8(S3, zero); p00 = _mm256_mullo_epi16(L0, coeff3); p10 = _mm256_mullo_epi16(L1, coeff7); p20 = _mm256_mullo_epi16(L2, coeff5); p30 = _mm256_add_epi16(L3, coeff8); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_srli_epi16(p00, 4); p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 0x0008); __m256i mask = _mm256_loadu_si256((__m256i*)intrinsic_mask_256_8bit[15]); _mm256_maskstore_epi64((__int64 *)&pfirst[0][i], mask, p00); p00 = _mm256_add_epi16(L1, L2); p00 = _mm256_mullo_epi16(p00, coeff3); p10 = _mm256_add_epi16(L0, L3); p10 = _mm256_add_epi16(p10, coeff4); p00 = _mm256_add_epi16(p10, p00); p00 = _mm256_srli_epi16(p00, 3); p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 0x0008); _mm256_maskstore_epi64((__int64 *)&pfirst[1][i], mask, p00); p10 = _mm256_mullo_epi16(L1, coeff5); p20 = _mm256_mullo_epi16(L2, coeff7); p30 = _mm256_mullo_epi16(L3, coeff3); p00 = _mm256_add_epi16(L0, coeff8); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 4); p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 0x0008); _mm256_maskstore_epi64((__int64 *)&pfirst[2][i], mask, p00); p00 = _mm256_add_epi16(L1, L2); p10 = _mm256_add_epi16(L2, L3); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, coeff2); p00 = _mm256_srli_epi16(p00, 2); p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 0x0008); _mm256_maskstore_epi64((__int64 *)&pfirst[3][i], mask, p00); } bsy >>= 2; int i_dstx4 = i_dst << 2; if (bsx == 64){ for (i = 0; i < bsy; i++){ __m256i M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i)); _mm256_storeu_si256((__m256i*)dst1, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 32)); _mm256_storeu_si256((__m256i*)(dst1 + 32), M); M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i)); _mm256_storeu_si256((__m256i*)dst2, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 32)); _mm256_storeu_si256((__m256i*)(dst2 + 32), M); M = _mm256_lddqu_si256((__m256i*)(pfirst[2] + i)); _mm256_storeu_si256((__m256i*)dst3, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[2] + i + 32)); _mm256_storeu_si256((__m256i*)(dst3 + 32), M); M = _mm256_lddqu_si256((__m256i*)(pfirst[3] + i)); _mm256_storeu_si256((__m256i*)dst4, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[3] + i + 32)); _mm256_storeu_si256((__m256i*)(dst4 + 32), M); dst1 += i_dstx4; dst2 += i_dstx4; dst3 += i_dstx4; dst4 += i_dstx4; } } else if (bsx == 32){ for (i = 0; i < bsy; i++){ __m256i M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i)); _mm256_storeu_si256((__m256i*)dst1, M); M = 
_mm256_lddqu_si256((__m256i*)(pfirst[1] + i)); _mm256_storeu_si256((__m256i*)dst2, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[2] + i)); _mm256_storeu_si256((__m256i*)dst3, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[3] + i)); _mm256_storeu_si256((__m256i*)dst4, M); dst1 += i_dstx4; dst2 += i_dstx4; dst3 += i_dstx4; dst4 += i_dstx4; } } else if (bsx == 16){ __m256i mask = _mm256_lddqu_si256((__m256i*)intrinsic_mask_256_8bit[15]); for (i = 0; i < bsy; i++){ __m256i M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i)); _mm256_maskstore_epi64((__int64 *)dst1, mask, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i)); _mm256_maskstore_epi64((__int64 *)dst2, mask, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[2] + i)); _mm256_maskstore_epi64((__int64 *)dst3, mask, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[3] + i)); _mm256_maskstore_epi64((__int64 *)dst4, mask, M); dst1 += i_dstx4; dst2 += i_dstx4; dst3 += i_dstx4; dst4 += i_dstx4; } } else if (bsx == 8){ __m256i mask = _mm256_lddqu_si256((__m256i*)intrinsic_mask_256_8bit[7]); for (i = 0; i < bsy; i++){ __m256i M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i)); _mm256_maskstore_epi64((__int64 *)dst1, mask, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i)); _mm256_maskstore_epi64((__int64 *)dst2, mask, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[2] + i)); _mm256_maskstore_epi64((__int64 *)dst3, mask, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[3] + i)); _mm256_maskstore_epi64((__int64 *)dst4, mask, M); dst1 += i_dstx4; dst2 += i_dstx4; dst3 += i_dstx4; dst4 += i_dstx4; } } else { __m256i mask = _mm256_lddqu_si256((__m256i*)intrinsic_mask_256_8bit[3]); for (i = 0; i < bsy; i++){ __m256i M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i)); _mm256_maskstore_epi32((int*)dst1, mask, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i)); _mm256_maskstore_epi32((int*)dst2, mask, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[2] + i)); _mm256_maskstore_epi32((int*)dst3, mask, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[3] + i)); _mm256_maskstore_epi32((int*)dst4, mask, M); dst1 += i_dstx4; dst2 += i_dstx4; dst3 += i_dstx4; dst4 += i_dstx4; } } /* if (bsx != 8) { switch (bsx) { case 4: for (i = 0; i < bsy; i++) { CP32(dst1, pfirst[0] + i); dst1 += i_dstx4; CP32(dst2, pfirst[1] + i); dst2 += i_dstx4; CP32(dst3, pfirst[2] + i); dst3 += i_dstx4; CP32(dst4, pfirst[3] + i); dst4 += i_dstx4; } break; case 16: for (i = 0; i < bsy; i++) { memcpy(dst1, pfirst[0] + i, 16 * sizeof(pel_t)); dst1 += i_dstx4; memcpy(dst2, pfirst[1] + i, 16 * sizeof(pel_t)); dst2 += i_dstx4; memcpy(dst3, pfirst[2] + i, 16 * sizeof(pel_t)); dst3 += i_dstx4; memcpy(dst4, pfirst[3] + i, 16 * sizeof(pel_t)); dst4 += i_dstx4; } break; case 32: for (i = 0; i < bsy; i++) { memcpy(dst1, pfirst[0] + i, 32 * sizeof(pel_t)); dst1 += i_dstx4; memcpy(dst2, pfirst[1] + i, 32 * sizeof(pel_t)); dst2 += i_dstx4; memcpy(dst3, pfirst[2] + i, 32 * sizeof(pel_t)); dst3 += i_dstx4; memcpy(dst4, pfirst[3] + i, 32 * sizeof(pel_t)); dst4 += i_dstx4; } break; case 64: for (i = 0; i < bsy; i++) { memcpy(dst1, pfirst[0] + i, 64 * sizeof(pel_t)); dst1 += i_dstx4; memcpy(dst2, pfirst[1] + i, 64 * sizeof(pel_t)); dst2 += i_dstx4; memcpy(dst3, pfirst[2] + i, 64 * sizeof(pel_t)); dst3 += i_dstx4; memcpy(dst4, pfirst[3] + i, 64 * sizeof(pel_t)); dst4 += i_dstx4; } break; default: assert(0); break; } } else { if (bsy == 2) { //8x8 for (i = 0; i < bsy; i++) { CP64(dst1, pfirst[0] + i); CP64(dst2, pfirst[1] + i); CP64(dst3, pfirst[2] + i); CP64(dst4, pfirst[3] + i); dst1 = dst4 + i_dst; dst2 = dst1 + i_dst; dst3 = 
dst2 + i_dst; dst4 = dst3 + i_dst; } } else {//8x32 __m128i M1 = _mm_loadu_si128((__m128i*)&pfirst[0][0]); __m128i M2 = _mm_loadu_si128((__m128i*)&pfirst[1][0]); __m128i M3 = _mm_loadu_si128((__m128i*)&pfirst[2][0]); __m128i M4 = _mm_loadu_si128((__m128i*)&pfirst[3][0]); _mm_storel_epi64((__m128i*)dst1, M1); _mm_storel_epi64((__m128i*)dst2, M2); _mm_storel_epi64((__m128i*)dst3, M3); _mm_storel_epi64((__m128i*)dst4, M4); dst1 = dst4 + i_dst; dst2 = dst1 + i_dst; dst3 = dst2 + i_dst; dst4 = dst3 + i_dst; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); M3 = _mm_srli_si128(M3, 1); M4 = _mm_srli_si128(M4, 1); _mm_storel_epi64((__m128i*)dst1, M1); _mm_storel_epi64((__m128i*)dst2, M2); _mm_storel_epi64((__m128i*)dst3, M3); _mm_storel_epi64((__m128i*)dst4, M4); dst1 = dst4 + i_dst; dst2 = dst1 + i_dst; dst3 = dst2 + i_dst; dst4 = dst3 + i_dst; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); M3 = _mm_srli_si128(M3, 1); M4 = _mm_srli_si128(M4, 1); _mm_storel_epi64((__m128i*)dst1, M1); _mm_storel_epi64((__m128i*)dst2, M2); _mm_storel_epi64((__m128i*)dst3, M3); _mm_storel_epi64((__m128i*)dst4, M4); dst1 = dst4 + i_dst; dst2 = dst1 + i_dst; dst3 = dst2 + i_dst; dst4 = dst3 + i_dst; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); M3 = _mm_srli_si128(M3, 1); M4 = _mm_srli_si128(M4, 1); _mm_storel_epi64((__m128i*)dst1, M1); _mm_storel_epi64((__m128i*)dst2, M2); _mm_storel_epi64((__m128i*)dst3, M3); _mm_storel_epi64((__m128i*)dst4, M4); dst1 = dst4 + i_dst; dst2 = dst1 + i_dst; dst3 = dst2 + i_dst; dst4 = dst3 + i_dst; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); M3 = _mm_srli_si128(M3, 1); M4 = _mm_srli_si128(M4, 1); _mm_storel_epi64((__m128i*)dst1, M1); _mm_storel_epi64((__m128i*)dst2, M2); _mm_storel_epi64((__m128i*)dst3, M3); _mm_storel_epi64((__m128i*)dst4, M4); dst1 = dst4 + i_dst; dst2 = dst1 + i_dst; dst3 = dst2 + i_dst; dst4 = dst3 + i_dst; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); M3 = _mm_srli_si128(M3, 1); M4 = _mm_srli_si128(M4, 1); _mm_storel_epi64((__m128i*)dst1, M1); _mm_storel_epi64((__m128i*)dst2, M2); _mm_storel_epi64((__m128i*)dst3, M3); _mm_storel_epi64((__m128i*)dst4, M4); dst1 = dst4 + i_dst; dst2 = dst1 + i_dst; dst3 = dst2 + i_dst; dst4 = dst3 + i_dst; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); M3 = _mm_srli_si128(M3, 1); M4 = _mm_srli_si128(M4, 1); _mm_storel_epi64((__m128i*)dst1, M1); _mm_storel_epi64((__m128i*)dst2, M2); _mm_storel_epi64((__m128i*)dst3, M3); _mm_storel_epi64((__m128i*)dst4, M4); dst1 = dst4 + i_dst; dst2 = dst1 + i_dst; dst3 = dst2 + i_dst; dst4 = dst3 + i_dst; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); M3 = _mm_srli_si128(M3, 1); M4 = _mm_srli_si128(M4, 1); _mm_storel_epi64((__m128i*)dst1, M1); _mm_storel_epi64((__m128i*)dst2, M2); _mm_storel_epi64((__m128i*)dst3, M3); _mm_storel_epi64((__m128i*)dst4, M4); } }*/ } } void intra_pred_ang_x_11_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i, j; UNUSED_PARAMETER(dir_mode); if (bsx & 0x07) { intra_pred_ang_x_11_sse128(src, dst, i_dst, dir_mode, bsx, bsy); } else if (bsx & 0x0f) { intra_pred_ang_x_11_sse128(src, dst, i_dst, dir_mode, bsx, bsy); } else if (bsx & 16){ __m256i S0, S1, S2, S3; __m256i t0, t1, t2, t3; __m256i c0; __m256i D0; __m256i off = _mm256_set1_epi16(64); __m256i mask = _mm256_loadu_si256((__m256i*)intrinsic_mask_256_8bit[15]); for (j = 0; j < bsy; j++) { int idx = (j + 1) >> 3; c0 = _mm256_set1_epi32(((int*)(tab_coeff_mode_11[j & 0x07]))[0]); S0 = _mm256_loadu_si256((__m256i*)(src + idx)); 
//0...7 8...15 16...23 24...31 S1 = _mm256_loadu_si256((__m256i*)(src + idx + 1));//1.. 8 9...16 17...24 25...32 S2 = _mm256_loadu_si256((__m256i*)(src + idx + 2));//2...9 10...17 S3 = _mm256_loadu_si256((__m256i*)(src + idx + 3));//3...10 11...18 S0 = _mm256_permute4x64_epi64(S0, 0x00D8);//0...7 16...23 8...15 24...31 S1 = _mm256_permute4x64_epi64(S1, 0x00D8);//1...8 17...24 9...16 25...32 S2 = _mm256_permute4x64_epi64(S2, 0x00D8); S3 = _mm256_permute4x64_epi64(S3, 0x00D8); t0 = _mm256_unpacklo_epi8(S0, S1);//0 1 1 2 2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 10 10 11 11 12 12 13 13 14 14 15 15 16 t1 = _mm256_unpacklo_epi8(S2, S3);//2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 10 10 11 11 12 12 13 13 14 14 15 15 16 16 17 17 18 t0 = _mm256_permute4x64_epi64(t0, 0x00D8); t1 = _mm256_permute4x64_epi64(t1, 0x00D8); t2 = _mm256_unpacklo_epi16(t0, t1);//0...7 t3 = _mm256_unpackhi_epi16(t0, t1);//8...15 t0 = _mm256_maddubs_epi16(t2, c0); t1 = _mm256_maddubs_epi16(t3, c0); D0 = _mm256_hadds_epi16(t0, t1);//0 1 2 3 8 9 10 11 4 5 6 7 12 13 14 15 D0 = _mm256_permute4x64_epi64(D0, 0x00D8); D0 = _mm256_add_epi16(D0, off); D0 = _mm256_srli_epi16(D0, 7); D0 = _mm256_packus_epi16(D0, D0); D0 = _mm256_permute4x64_epi64(D0, 0x00D8); _mm256_maskstore_epi64((__int64 *)dst, mask, D0); dst += i_dst; } } else { __m256i S0, S1, S2, S3; __m256i t0, t1, t2, t3; __m256i c0; __m256i D0, D1; __m256i off = _mm256_set1_epi16(64); for (j = 0; j < bsy; j++) { int idx = (j + 1) >> 3; c0 = _mm256_set1_epi32(((int*)tab_coeff_mode_11[j & 0x07])[0]); for (i = 0; i < bsx; i += 32, idx += 32) { S0 = _mm256_loadu_si256((__m256i*)(src + idx)); //0...7 8...15 16...23 24...31 S1 = _mm256_loadu_si256((__m256i*)(src + idx + 1));//1.. 8 9...16 17...24 25...32 S2 = _mm256_loadu_si256((__m256i*)(src + idx + 2));//2...9 10...17 18 S3 = _mm256_loadu_si256((__m256i*)(src + idx + 3));//3...10 11...18 19 S0 = _mm256_permute4x64_epi64(S0, 0x00D8);//0...7 16...23 8...15 24...31 S1 = _mm256_permute4x64_epi64(S1, 0x00D8);//1...8 17...24 9...16 25...32 S2 = _mm256_permute4x64_epi64(S2, 0x00D8); S3 = _mm256_permute4x64_epi64(S3, 0x00D8); t0 = _mm256_unpacklo_epi8(S0, S1);//0 1 1 2 2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 10 10 11 11 12 12 13 13 14 14 15 15 16 t1 = _mm256_unpacklo_epi8(S2, S3);//2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 10 10 11 11 12 12 13 13 14 14 15 15 16 16 17 17 18 t0 = _mm256_permute4x64_epi64(t0, 0x00D8); t1 = _mm256_permute4x64_epi64(t1, 0x00D8); t2 = _mm256_unpacklo_epi16(t0, t1);// t3 = _mm256_unpackhi_epi16(t0, t1);//........15 16 17 18 t0 = _mm256_maddubs_epi16(t2, c0); t1 = _mm256_maddubs_epi16(t3, c0); D0 = _mm256_hadds_epi16(t0, t1);//0 1 2 3 8 9 10 11 4 5 6 7 12 13 14 15 D0 = _mm256_permute4x64_epi64(D0, 0x00D8); D0 = _mm256_add_epi16(D0, off); D0 = _mm256_srli_epi16(D0, 7); t0 = _mm256_unpackhi_epi8(S0, S1);//16 17 17 18 18 19 19 20 20 21 21 22 22 23 23 24...24 25 25.. t1 = _mm256_unpackhi_epi8(S2, S3);//18 19 19 20 ..... t0 = _mm256_permute4x64_epi64(t0, 0x00D8); t1 = _mm256_permute4x64_epi64(t1, 0x00D8); t2 = _mm256_unpacklo_epi16(t0, t1);//16 17 18 19... t3 = _mm256_unpackhi_epi16(t0, t1);//24 25 26 27... 
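                // 4-tap filtering in two AVX2 steps: each 32-bit lane of t2/t3 now
                // holds four neighbouring bytes s[k]..s[k+3]; _mm256_maddubs_epi16
                // multiplies them by the packed coefficients in c0 and sums adjacent
                // pairs, and the _mm256_hadds_epi16 below folds those pairs into the
                // full 4-tap sum before rounding (+64) and the final >> 7.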
t0 = _mm256_maddubs_epi16(t2, c0); t1 = _mm256_maddubs_epi16(t3, c0); D1 = _mm256_hadds_epi16(t0, t1);//16 17 18 19 24 25 26 27 20 21 22 23 28 29 30 31 D1 = _mm256_permute4x64_epi64(D1, 0x00D8); D1 = _mm256_add_epi16(D1, off); D1 = _mm256_srli_epi16(D1, 7); D0 = _mm256_packus_epi16(D0, D1); D0 = _mm256_permute4x64_epi64(D0, 0x00D8); _mm256_storeu_si256((__m256i*)(dst + i), D0); } dst += i_dst; } } } void intra_pred_ang_y_25_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { UNUSED_PARAMETER(dir_mode); int i; if (bsx > 8) { ALIGN32(pel_t first_line[64 + (64 << 3)]); int line_size = bsx + ((bsy - 1) << 3); int iHeight8 = bsy << 3; pel_t *pfirst = first_line; __m256i coeff0 = _mm256_setr_epi16( 7, 3, 5, 1, 3, 1, 1, 0, 7, 3, 5, 1, 3, 1, 1, 0); __m256i coeff1 = _mm256_setr_epi16(15, 7, 13, 3, 11, 5, 9, 1, 15, 7, 13, 3, 11, 5, 9, 1); __m256i coeff2 = _mm256_setr_epi16( 9, 5, 11, 3, 13, 7, 15, 2, 9, 5, 11, 3, 13, 7, 15, 2); __m256i coeff3 = _mm256_setr_epi16( 1, 1, 3, 1, 5, 3, 7, 1, 1, 1, 3, 1, 5, 3, 7, 1); __m256i coeff4 = _mm256_setr_epi16(16, 8, 16, 4, 16, 8, 16, 2, 16, 8, 16, 4, 16, 8, 16, 2); __m256i coeff5 = _mm256_setr_epi16( 1, 2, 1, 4, 1, 2, 1, 8, 1, 2, 1, 4, 1, 2, 1, 8); __m256i p00, p10, p20, p30; __m256i p01, p11, p21, p31; __m256i res1, res2; __m256i L0 = _mm256_setr_epi16(src[0], src[0], src[0], src[0], src[0], src[0], src[0], src[0], src[-4], src[-4], src[-4], src[-4], src[-4], src[-4], src[-4], src[-4]); __m256i L1 = _mm256_setr_epi16(src[-1], src[-1], src[-1], src[-1], src[-1], src[-1], src[-1], src[-1], src[-5], src[-5], src[-5], src[-5], src[-5], src[-5], src[-5], src[-5]); __m256i L2 = _mm256_setr_epi16(src[-2], src[-2], src[-2], src[-2], src[-2], src[-2], src[-2], src[-2], src[-6], src[-6], src[-6], src[-6], src[-6], src[-6], src[-6], src[-6]); __m256i L3 = _mm256_setr_epi16(src[-3], src[-3], src[-3], src[-3], src[-3], src[-3], src[-3], src[-3], src[-7], src[-7], src[-7], src[-7], src[-7], src[-7], src[-7], src[-7]); src -= 4; for (i = 0; i < line_size; i += 64, src -= 4) { p00 = _mm256_mullo_epi16(L0, coeff0);//0...4... p10 = _mm256_mullo_epi16(L1, coeff1);//1...5... p20 = _mm256_mullo_epi16(L2, coeff2);//2...6... p30 = _mm256_mullo_epi16(L3, coeff3);//3...7... p00 = _mm256_add_epi16(p00, coeff4); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_mullo_epi16(p00, coeff5); p00 = _mm256_srli_epi16(p00, 5); L0 = _mm256_setr_epi16(src[0], src[0], src[0], src[0], src[0], src[0], src[0], src[0], src[-4], src[-4], src[-4], src[-4], src[-4], src[-4], src[-4], src[-4]);//4 8 p01 = _mm256_mullo_epi16(L1, coeff0);//1...5... p11 = _mm256_mullo_epi16(L2, coeff1);//2...6... p21 = _mm256_mullo_epi16(L3, coeff2);//3...7... p31 = _mm256_mullo_epi16(L0, coeff3);//4...8... p01 = _mm256_add_epi16(p01, coeff4); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, p21); p01 = _mm256_add_epi16(p01, p31); p01 = _mm256_mullo_epi16(p01, coeff5); p01 = _mm256_srli_epi16(p01, 5); res1 = _mm256_packus_epi16(p00, p01); L1 = _mm256_setr_epi16(src[-1], src[-1], src[-1], src[-1], src[-1], src[-1], src[-1], src[-1], src[-5], src[-5], src[-5], src[-5], src[-5], src[-5], src[-5], src[-5]);//5 9 p00 = _mm256_mullo_epi16(L2, coeff0);//2...6... p10 = _mm256_mullo_epi16(L3, coeff1);//3...7... p20 = _mm256_mullo_epi16(L0, coeff2);//4...8... p30 = _mm256_mullo_epi16(L1, coeff3);//5...9... 
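            // Each 16-bit lane belongs to one of the interpolation phases, which
            // nominally need different shifts; the per-lane rounding offsets in
            // coeff4 and the post-multipliers in coeff5 scale every lane so that
            // the single >> 5 below normalizes all phases at once.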
p00 = _mm256_add_epi16(p00, coeff4); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_mullo_epi16(p00, coeff5); p00 = _mm256_srli_epi16(p00, 5); L2 = _mm256_setr_epi16(src[-2], src[-2], src[-2], src[-2], src[-2], src[-2], src[-2], src[-2], src[-6], src[-6], src[-6], src[-6], src[-6], src[-6], src[-6], src[-6]);//6 10 p01 = _mm256_mullo_epi16(L3, coeff0);//3...7... p11 = _mm256_mullo_epi16(L0, coeff1);//4...8... p21 = _mm256_mullo_epi16(L1, coeff2);//5...9... p31 = _mm256_mullo_epi16(L2, coeff3);//6...10... p01 = _mm256_add_epi16(p01, coeff4); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, p21); p01 = _mm256_add_epi16(p01, p31); p01 = _mm256_mullo_epi16(p01, coeff5); p01 = _mm256_srli_epi16(p01, 5); res2 = _mm256_packus_epi16(p00, p01); p00 = _mm256_permute2x128_si256(res1, res2, 0x0020); _mm256_storeu_si256((__m256i*)pfirst, p00); pfirst += 32; p00 = _mm256_permute2x128_si256(res1, res2, 0x0031); _mm256_storeu_si256((__m256i*)pfirst, p00); pfirst += 32; src -= 4; L0 = _mm256_setr_epi16(src[0], src[0], src[0], src[0], src[0], src[0], src[0], src[0], src[-4], src[-4], src[-4], src[-4], src[-4], src[-4], src[-4], src[-4]);//8 12 L1 = _mm256_setr_epi16(src[-1], src[-1], src[-1], src[-1], src[-1], src[-1], src[-1], src[-1], src[-5], src[-5], src[-5], src[-5], src[-5], src[-5], src[-5], src[-5]);//9 13 L2 = _mm256_setr_epi16(src[-2], src[-2], src[-2], src[-2], src[-2], src[-2], src[-2], src[-2], src[-6], src[-6], src[-6], src[-6], src[-6], src[-6], src[-6], src[-6]);//10 14 L3 = _mm256_setr_epi16(src[-3], src[-3], src[-3], src[-3], src[-3], src[-3], src[-3], src[-3], src[-7], src[-7], src[-7], src[-7], src[-7], src[-7], src[-7], src[-7]);//11 15 } //if (bsx == 16) {// 8 // __m256i mask = _mm256_loadu_si256((__m256i*)intrinsic_mask_256_8bit[7]); // p00 = _mm256_mullo_epi16(L0, coeff0); // p10 = _mm256_mullo_epi16(L1, coeff1); // p20 = _mm256_mullo_epi16(L2, coeff2); // p30 = _mm256_mullo_epi16(L3, coeff3); // p00 = _mm256_add_epi16(p00, coeff4); // p00 = _mm256_add_epi16(p00, p10); // p00 = _mm256_add_epi16(p00, p20); // p00 = _mm256_add_epi16(p00, p30); // p00 = _mm256_mullo_epi16(p00, coeff5); // p00 = _mm256_srli_epi16(p00, 5); // // p00 = _mm256_packus_epi16(p00, p00); // p00 = _mm256_permute4x64_epi64(p00, 0x0008); // _mm256_maskstore_epi64((__m256i*)pfirst, mask, p00); //} else if(bsx == 32){ // __m256i mask = _mm256_set_epi64x(0, -1, -1, -1); // p00 = _mm256_mullo_epi16(L0, coeff0); // p10 = _mm256_mullo_epi16(L1, coeff1); // p20 = _mm256_mullo_epi16(L2, coeff2); // p30 = _mm256_mullo_epi16(L3, coeff3); // p00 = _mm256_add_epi16(p00, coeff4); // p00 = _mm256_add_epi16(p00, p10); // p00 = _mm256_add_epi16(p00, p20); // p00 = _mm256_add_epi16(p00, p30); // p00 = _mm256_mullo_epi16(p00, coeff5); // p00 = _mm256_srli_epi16(p00, 5); // // L0 = _mm256_setr_epi16(src[0], src[0], src[0], src[0], src[0], src[0], src[0], src[0], // src[-4], src[-4], src[-4], src[-4], src[-4], src[-4], src[-4], src[-4]); // // p01 = _mm256_mullo_epi16(L1, coeff0); // p11 = _mm256_mullo_epi16(L2, coeff1); // p21 = _mm256_mullo_epi16(L3, coeff2); // p31 = _mm256_mullo_epi16(L0, coeff3); // p01 = _mm256_add_epi16(p01, coeff4); // p01 = _mm256_add_epi16(p01, p11); // p01 = _mm256_add_epi16(p01, p21); // p01 = _mm256_add_epi16(p01, p31); // p01 = _mm256_mullo_epi16(p01, coeff5); // p01 = _mm256_srli_epi16(p01, 5); // // p00 = _mm256_packus_epi16(p00, p01); // p00 = _mm256_permute4x64_epi64(p00, 0x00D8); // _mm256_maskstore_epi64((__int64 
*)pfirst, mask, p00); // //} else { // __m256i mask = _mm256_set_epi64x(0, -1, -1, -1); // p00 = _mm256_mullo_epi16(L0, coeff0); // p10 = _mm256_mullo_epi16(L1, coeff1); // p20 = _mm256_mullo_epi16(L2, coeff2); // p30 = _mm256_mullo_epi16(L3, coeff3); // p00 = _mm256_add_epi16(p00, coeff4); // p00 = _mm256_add_epi16(p00, p10); // p00 = _mm256_add_epi16(p00, p20); // p00 = _mm256_add_epi16(p00, p30); // p00 = _mm256_mullo_epi16(p00, coeff5); // p00 = _mm256_srli_epi16(p00, 5); // // L0 = _mm256_setr_epi16(src[0], src[0], src[0], src[0], src[0], src[0], src[0], src[0], // src[-4], src[-4], src[-4], src[-4], src[-4], src[-4], src[-4], src[-4]); // // p01 = _mm256_mullo_epi16(L1, coeff0); // p11 = _mm256_mullo_epi16(L2, coeff1); // p21 = _mm256_mullo_epi16(L3, coeff2); // p31 = _mm256_mullo_epi16(L0, coeff3); // p01 = _mm256_add_epi16(p01, coeff4); // p01 = _mm256_add_epi16(p01, p11); // p01 = _mm256_add_epi16(p01, p21); // p01 = _mm256_add_epi16(p01, p31); // p01 = _mm256_mullo_epi16(p01, coeff5); // p01 = _mm256_srli_epi16(p01, 5); // // p00 = _mm256_packus_epi16(p00, p01); // p00 = _mm256_permute4x64_epi64(p00, 0x00D8); // _mm256_storeu_si256((__m256*)pfirst, p00); // // pfirst += 32; // // L1 = _mm256_setr_epi16(src[-1], src[-1], src[-1], src[-1], src[-1], src[-1], src[-1], src[-1], // src[-5], src[-5], src[-5], src[-5], src[-5], src[-5], src[-5], src[-5]); // // p00 = _mm256_mullo_epi16(L2, coeff0); // p10 = _mm256_mullo_epi16(L3, coeff1); // p20 = _mm256_mullo_epi16(L0, coeff2); // p30 = _mm256_mullo_epi16(L1, coeff3); // p00 = _mm256_add_epi16(p00, coeff4); // p00 = _mm256_add_epi16(p00, p10); // p00 = _mm256_add_epi16(p00, p20); // p00 = _mm256_add_epi16(p00, p30); // p00 = _mm256_mullo_epi16(p00, coeff5); // p00 = _mm256_srli_epi16(p00, 5); // // p00 = _mm256_packus_epi16(p00, p00); // p00 = _mm256_permute4x64_epi64(p00, 0x0008); // _mm256_maskstore_epi64((__int64 *)pfirst, mask, p00); // //} __m256i M; if (bsx == 64) { for (i = 0; i < iHeight8; i += 32){ M = _mm256_lddqu_si256((__m256i*)(first_line + i)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(first_line + i + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(first_line + i + 8)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(first_line + i + +8 + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(first_line + i + 16)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(first_line + i + 16 + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(first_line + i + 24)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(first_line + i + 24 + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; } } else if (bsx == 32){ for (i = 0; i < iHeight8; i += 32){ M = _mm256_lddqu_si256((__m256i*)(first_line + i)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(first_line + i + 8)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(first_line + i + 16)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(first_line + i + 24)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; } } else if (bsx == 16){ __m256i mask = _mm256_lddqu_si256((__m256i*)intrinsic_mask_256_8bit[15]); for (i = 0; i < iHeight8; i += 32){ M = _mm256_lddqu_si256((__m256i*)(first_line + i)); 
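            // bsx == 16: full 32-byte loads from first_line, then masked 64-bit
            // stores (mask = intrinsic_mask_256_8bit[15]) so that only the 16
            // bytes belonging to this block are written to each row.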
_mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(first_line + i + 8)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(first_line + i + 16)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(first_line + i + 24)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; } } /*for (i = 0; i < iHeight8; i += 8) { memcpy(dst, first_line + i, bsx * sizeof(pel_t)); dst += i_dst; }*/ } else {//8x8 8x32 4x4 4x16 intra_pred_ang_y_25_sse128(src, dst, i_dst, dir_mode, bsx, bsy); return ; } } void intra_pred_ang_y_26_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i; UNUSED_PARAMETER(dir_mode); if (bsx != 4) { __m256i coeff2 = _mm256_set1_epi16(2); __m256i coeff3 = _mm256_set1_epi16(3); __m256i coeff4 = _mm256_set1_epi16(4); __m256i coeff5 = _mm256_set1_epi16(5); __m256i coeff7 = _mm256_set1_epi16(7); __m256i coeff8 = _mm256_set1_epi16(8); __m256i shuffle = _mm256_setr_epi8(7, 15, 6, 14, 5, 13, 4, 12, 3, 11, 2, 10, 1, 9, 0, 8, 7, 15, 6, 14, 5, 13, 4, 12, 3, 11, 2, 10, 1, 9, 0, 8); ALIGN32(pel_t first_line[64 + 256]); int line_size = bsx + (bsy - 1) * 4; int iHeight4 = bsy << 2; src -= 31; __m256i p00, p10, p20, p30; __m256i p01, p11, p21, p31; __m256i M1, M2, M3, M4, M5, M6, M7, M8; __m256i S0, S1, S2, S3; __m256i L0, L1, L2, L3; __m256i H0, H1, H2, H3; for (i = 0; i < line_size - 64; i += 128, src -= 32) { S0 = _mm256_loadu_si256((__m256i*)(src)); //15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 S1 = _mm256_loadu_si256((__m256i*)(src - 1));//16 15 14... S2 = _mm256_loadu_si256((__m256i*)(src - 2));//17 16 15... S3 = _mm256_loadu_si256((__m256i*)(src - 3));//18 17 16... L0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 0));//15 14 13 12 11 10 9 8 L1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 0));//16 15 14... L2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 0));//17 16 15... L3 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S3, 0));//18 17 16... H0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 1));//7 6 5 4 3 2 1 0 H1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 1));//8 7 6.. H2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 1));//9 8 7... H3 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S3, 1));//10 9 8... 
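        // Bytes are widened to 16 bits before filtering: the largest weighted sum
        // here (3+7+5+1 = 16 times a 255-valued pel, plus the rounding offset 8)
        // is 4088, so it fits in a 16-bit lane without saturating.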
p00 = _mm256_mullo_epi16(L0, coeff3); p10 = _mm256_mullo_epi16(L1, coeff7); p20 = _mm256_mullo_epi16(L2, coeff5); p30 = _mm256_add_epi16(L3, coeff8); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); M1 = _mm256_srli_epi16(p00, 4);//31...16 p01 = _mm256_mullo_epi16(H0, coeff3); p11 = _mm256_mullo_epi16(H1, coeff7); p21 = _mm256_mullo_epi16(H2, coeff5); p31 = _mm256_add_epi16(H3, coeff8); p01 = _mm256_add_epi16(p01, p31); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, p21); M2 = _mm256_srli_epi16(p01, 4);//15...0 p00 = _mm256_add_epi16(L1, L2); p00 = _mm256_mullo_epi16(p00, coeff3); p10 = _mm256_add_epi16(L0, L3); p10 = _mm256_add_epi16(p10, coeff4); p00 = _mm256_add_epi16(p10, p00); M3 = _mm256_srli_epi16(p00, 3); p01 = _mm256_add_epi16(H1, H2); p01 = _mm256_mullo_epi16(p01, coeff3); p11 = _mm256_add_epi16(H0, H3); p11 = _mm256_add_epi16(p11, coeff4); p01 = _mm256_add_epi16(p11, p01); M4 = _mm256_srli_epi16(p01, 3); p10 = _mm256_mullo_epi16(L1, coeff5); p20 = _mm256_mullo_epi16(L2, coeff7); p30 = _mm256_mullo_epi16(L3, coeff3); p00 = _mm256_add_epi16(L0, coeff8); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); M5 = _mm256_srli_epi16(p00, 4);//31...16 p11 = _mm256_mullo_epi16(H1, coeff5); p21 = _mm256_mullo_epi16(H2, coeff7); p31 = _mm256_mullo_epi16(H3, coeff3); p01 = _mm256_add_epi16(H0, coeff8); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, p21); p01 = _mm256_add_epi16(p01, p31); M6 = _mm256_srli_epi16(p01, 4);//15...0 p00 = _mm256_add_epi16(L1, L2); p10 = _mm256_add_epi16(L2, L3); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, coeff2); M7 = _mm256_srli_epi16(p00, 2); p01 = _mm256_add_epi16(H1, H2); p11 = _mm256_add_epi16(H2, H3); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, coeff2); M8 = _mm256_srli_epi16(p01, 2); M1 = _mm256_packus_epi16(M1, M3); M5 = _mm256_packus_epi16(M5, M7); M1 = _mm256_shuffle_epi8(M1, shuffle); M5 = _mm256_shuffle_epi8(M5, shuffle); M2 = _mm256_packus_epi16(M2, M4); M6 = _mm256_packus_epi16(M6, M8); M2 = _mm256_shuffle_epi8(M2, shuffle); M6 = _mm256_shuffle_epi8(M6, shuffle); //M1 = _mm256_permute4x64_epi64(M1, 0x4E); //M5 = _mm256_permute4x64_epi64(M5, 0x4E); //M2 = _mm256_permute4x64_epi64(M2, 0x4E); //M6 = _mm256_permute4x64_epi64(M6, 0x4E); M1 = _mm256_permute4x64_epi64(M1, 0x72); M5 = _mm256_permute4x64_epi64(M5, 0x72); M2 = _mm256_permute4x64_epi64(M2, 0x72); M6 = _mm256_permute4x64_epi64(M6, 0x72); M3 = _mm256_unpacklo_epi16(M1, M5); M7 = _mm256_unpackhi_epi16(M1, M5); M4 = _mm256_unpacklo_epi16(M2, M6); M8 = _mm256_unpackhi_epi16(M2, M6); _mm256_storeu_si256((__m256i*)&first_line[i], M4); _mm256_storeu_si256((__m256i*)&first_line[32 + i], M8); _mm256_storeu_si256((__m256i*)&first_line[64 + i], M3); _mm256_storeu_si256((__m256i*)&first_line[96 + i], M7); } if (i < line_size) { S0 = _mm256_loadu_si256((__m256i*)(src)); //15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 S1 = _mm256_loadu_si256((__m256i*)(src - 1));//16 15 14... S2 = _mm256_loadu_si256((__m256i*)(src - 2));//17 16 15... S3 = _mm256_loadu_si256((__m256i*)(src - 3));//18 17 16... H0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 1));//7 6 5 4 3 2 1 0 H1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 1));//8 7 6.. H2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 1));//9 8 7... H3 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S3, 1));//10 9 8... 
p01 = _mm256_mullo_epi16(H0, coeff3); p11 = _mm256_mullo_epi16(H1, coeff7); p21 = _mm256_mullo_epi16(H2, coeff5); p31 = _mm256_add_epi16(H3, coeff8); p01 = _mm256_add_epi16(p01, p31); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, p21); M2 = _mm256_srli_epi16(p01, 4);//15...0 p01 = _mm256_add_epi16(H1, H2); p01 = _mm256_mullo_epi16(p01, coeff3); p11 = _mm256_add_epi16(H0, H3); p11 = _mm256_add_epi16(p11, coeff4); p01 = _mm256_add_epi16(p11, p01); M4 = _mm256_srli_epi16(p01, 3); p11 = _mm256_mullo_epi16(H1, coeff5); p21 = _mm256_mullo_epi16(H2, coeff7); p31 = _mm256_mullo_epi16(H3, coeff3); p01 = _mm256_add_epi16(H0, coeff8); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, p21); p01 = _mm256_add_epi16(p01, p31); M6 = _mm256_srli_epi16(p01, 4);//15...0 p01 = _mm256_add_epi16(H1, H2); p11 = _mm256_add_epi16(H2, H3); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, coeff2); M8 = _mm256_srli_epi16(p01, 2); M2 = _mm256_packus_epi16(M2, M4); M6 = _mm256_packus_epi16(M6, M8); M2 = _mm256_shuffle_epi8(M2, shuffle); M6 = _mm256_shuffle_epi8(M6, shuffle); //M2 = _mm256_permute4x64_epi64(M2, 0x4E); //M6 = _mm256_permute4x64_epi64(M6, 0x4E); M2 = _mm256_permute4x64_epi64(M2, 0x72); M6 = _mm256_permute4x64_epi64(M6, 0x72); M4 = _mm256_unpacklo_epi16(M2, M6); M8 = _mm256_unpackhi_epi16(M2, M6); _mm256_storeu_si256((__m256i*)&first_line[i], M4); _mm256_storeu_si256((__m256i*)&first_line[32 + i], M8); } __m256i M; if (bsx == 64) { for (i = 0; i < iHeight4; i += 16){ M = _mm256_lddqu_si256((__m256i*)(first_line + i)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(first_line + i + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(first_line + i + 4)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(first_line + i + 32 + 4)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(first_line + i + 8)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(first_line + i + 32 + 8)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(first_line + i + 12)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(first_line + i + 32 + 12)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; } } else if (bsx == 32) { for (i = 0; i < iHeight4; i += 16){ M = _mm256_lddqu_si256((__m256i*)(first_line + i)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(first_line + i + 4)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(first_line + i + 8)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(first_line + i + 12)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; } } else if (bsx == 16){ __m256i mask = _mm256_lddqu_si256((__m256i*)intrinsic_mask_256_8bit[15]); for (i = 0; i < iHeight4; i += 16){ M = _mm256_lddqu_si256((__m256i*)(first_line + i)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(first_line + i + 4)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(first_line + i + 8)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(first_line + i + 12)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; } } else { __m256i mask = 
_mm256_lddqu_si256((__m256i*)intrinsic_mask_256_8bit[7]);
            for (i = 0; i < iHeight4; i += 16) {
                M = _mm256_lddqu_si256((__m256i*)(first_line + i));
                _mm256_maskstore_epi64((__int64 *)dst, mask, M);
                dst += i_dst;
                M = _mm256_lddqu_si256((__m256i*)(first_line + i + 4));
                _mm256_maskstore_epi64((__int64 *)dst, mask, M);
                dst += i_dst;
                M = _mm256_lddqu_si256((__m256i*)(first_line + i + 8));
                _mm256_maskstore_epi64((__int64 *)dst, mask, M);
                dst += i_dst;
                M = _mm256_lddqu_si256((__m256i*)(first_line + i + 12));
                _mm256_maskstore_epi64((__int64 *)dst, mask, M);
                dst += i_dst;
            }
        }
    } else { //4x4 4x16
        intra_pred_ang_y_26_sse128(src, dst, i_dst, dir_mode, bsx, bsy);
        return;
    }
}

void intra_pred_ang_y_28_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
{
    ALIGN32(pel_t first_line[64 + 128]);
    int line_size = bsx + (bsy - 1) * 2;
    int i;
    int iHeight2 = bsy << 1;
    UNUSED_PARAMETER(dir_mode);

    __m256i coeff2 = _mm256_set1_epi16(2);
    __m256i coeff3 = _mm256_set1_epi16(3);
    __m256i coeff4 = _mm256_set1_epi16(4);
    __m256i shuffle = _mm256_setr_epi8(7, 15, 6, 14, 5, 13, 4, 12, 3, 11, 2, 10, 1, 9, 0, 8,
                                       7, 15, 6, 14, 5, 13, 4, 12, 3, 11, 2, 10, 1, 9, 0, 8);
    src -= 31;
    __m256i p00, p10;
    __m256i p01, p11;
    __m256i S0, S1, S2, S3;
    __m256i L0, L1, L2, L3;
    __m256i H0, H1, H2, H3;
    for (i = 0; i < line_size - 32; i += 64, src -= 32) {
        S0 = _mm256_loadu_si256((__m256i*)(src));
        S3 = _mm256_loadu_si256((__m256i*)(src - 3));
        S1 = _mm256_loadu_si256((__m256i*)(src - 1));
        S2 = _mm256_loadu_si256((__m256i*)(src - 2));

        L0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 0));//15 14 13 12 11 10 9 8
        L1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 0));//16 15 14...
        L2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 0));//17 16 15...
        L3 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S3, 0));//18 17 16...
        H0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 1));//7 6 5 4 3 2 1 0
        H1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 1));//8 7 6..
        H2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 1));//9 8 7...
        H3 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S3, 1));//10 9 8...
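        // Two interpolation phases per reference sample:
        //   (L0 + 3*L1 + 3*L2 + L3 + 4) >> 3   and   (L1 + 2*L2 + L3 + 2) >> 2;
        // the results are packed and the byte order reversed/interleaved by
        // 'shuffle', since src steps backwards through the left reference.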
        p00 = _mm256_adds_epi16(L1, L2);
        p00 = _mm256_mullo_epi16(p00, coeff3);
        p10 = _mm256_add_epi16(L0, L3);
        p10 = _mm256_add_epi16(p10, coeff4);
        p00 = _mm256_add_epi16(p00, p10);
        p00 = _mm256_srli_epi16(p00, 3);// phase 0: 31...16

        p01 = _mm256_add_epi16(L1, L2);
        p11 = _mm256_add_epi16(L2, L3);
        p01 = _mm256_add_epi16(p01, p11);
        p01 = _mm256_add_epi16(p01, coeff2);
        p01 = _mm256_srli_epi16(p01, 2);// phase 1: 31...16

        p00 = _mm256_packus_epi16(p00, p01);
        p00 = _mm256_shuffle_epi8(p00, shuffle);
        p00 = _mm256_permute4x64_epi64(p00, 0x4E);
        _mm256_storeu_si256((__m256i*)&first_line[i + 32], p00);

        p00 = _mm256_adds_epi16(H1, H2);
        p00 = _mm256_mullo_epi16(p00, coeff3);
        p10 = _mm256_adds_epi16(H0, H3);
        p10 = _mm256_adds_epi16(p10, coeff4);
        p00 = _mm256_adds_epi16(p00, p10);
        p00 = _mm256_srli_epi16(p00, 3);

        p01 = _mm256_add_epi16(H1, H2);
        p11 = _mm256_add_epi16(H2, H3);
        p01 = _mm256_add_epi16(p01, p11);
        p01 = _mm256_add_epi16(p01, coeff2);
        p01 = _mm256_srli_epi16(p01, 2);

        p00 = _mm256_packus_epi16(p00, p01);
        p00 = _mm256_shuffle_epi8(p00, shuffle);
        p00 = _mm256_permute4x64_epi64(p00, 0x4E);
        _mm256_storeu_si256((__m256i*)&first_line[i], p00);
    }
    if (i < line_size) {
        S0 = _mm256_loadu_si256((__m256i*)(src));
        S3 = _mm256_loadu_si256((__m256i*)(src - 3));
        S1 = _mm256_loadu_si256((__m256i*)(src - 1));
        S2 = _mm256_loadu_si256((__m256i*)(src - 2));

        H0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 1));//7 6 5 4 3 2 1 0
        H1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 1));//8 7 6..
        H2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 1));//9 8 7...
        H3 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S3, 1));//10 9 8...

        p00 = _mm256_adds_epi16(H1, H2);
        p00 = _mm256_mullo_epi16(p00, coeff3);
        p10 = _mm256_adds_epi16(H0, H3);
        p10 = _mm256_adds_epi16(p10, coeff4);
        p00 = _mm256_adds_epi16(p00, p10);
        p00 = _mm256_srli_epi16(p00, 3);

        p01 = _mm256_add_epi16(H1, H2);
        p11 = _mm256_add_epi16(H2, H3);
        p01 = _mm256_add_epi16(p01, p11);
        p01 = _mm256_add_epi16(p01, coeff2);
        p01 = _mm256_srli_epi16(p01, 2);

        p00 = _mm256_packus_epi16(p00, p01);
        p00 = _mm256_shuffle_epi8(p00, shuffle);
        p00 = _mm256_permute4x64_epi64(p00, 0x4E);
        _mm256_storeu_si256((__m256i*)&first_line[i], p00);
    }

    if (bsx == 64) {
        for (i = 0; i < iHeight2; i += 8) {
            __m256i M = _mm256_lddqu_si256((__m256i*)&first_line[i]);
            _mm256_storeu_si256((__m256i*)dst, M);
            M = _mm256_lddqu_si256((__m256i*)(&first_line[i] + 32));
            _mm256_storeu_si256((__m256i*)(dst + 32), M);
            dst += i_dst;
            M = _mm256_lddqu_si256((__m256i*)&first_line[i + 2]);
            _mm256_storeu_si256((__m256i*)dst, M);
            M = _mm256_lddqu_si256((__m256i*)(&first_line[i + 2] + 32));
            _mm256_storeu_si256((__m256i*)(dst + 32), M);
            dst += i_dst;
            M = _mm256_lddqu_si256((__m256i*)&first_line[i + 4]);
            _mm256_storeu_si256((__m256i*)dst, M);
            M = _mm256_lddqu_si256((__m256i*)(&first_line[i + 4] + 32));
            _mm256_storeu_si256((__m256i*)(dst + 32), M);
            dst += i_dst;
            M = _mm256_lddqu_si256((__m256i*)&first_line[i + 6]);
            _mm256_storeu_si256((__m256i*)dst, M);
            M = _mm256_lddqu_si256((__m256i*)(&first_line[i + 6] + 32));
            _mm256_storeu_si256((__m256i*)(dst + 32), M);
            dst += i_dst;
        }
    } else if (bsx == 32) {
        for (i = 0; i < iHeight2; i += 8) {
            __m256i M = _mm256_lddqu_si256((__m256i*)&first_line[i]);
            _mm256_storeu_si256((__m256i*)dst, M);
            dst += i_dst;
            M = _mm256_lddqu_si256((__m256i*)&first_line[i + 2]);
            _mm256_storeu_si256((__m256i*)dst, M);
            dst += i_dst;
            M = _mm256_lddqu_si256((__m256i*)&first_line[i + 4]);
            _mm256_storeu_si256((__m256i*)dst, M);
            dst += i_dst;
            M =
_mm256_lddqu_si256((__m256i*)&first_line[i + 6]); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; } } else if (bsx == 16){ __m256i mask = _mm256_lddqu_si256((__m256i*)intrinsic_mask_256_8bit[15]); for (i = 0; i < iHeight2; i += 8){ __m256i M = _mm256_lddqu_si256((__m256i*)&first_line[i]); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)&first_line[i + 2]); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)&first_line[i + 4]); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)&first_line[i + 6]); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; } } else if (bsx == 8){ __m256i mask = _mm256_lddqu_si256((__m256i*)intrinsic_mask_256_8bit[7]); for (i = 0; i < iHeight2; i += 8){ __m256i M = _mm256_lddqu_si256((__m256i*)&first_line[i]); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)&first_line[i + 2]); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)&first_line[i + 4]); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)&first_line[i + 6]); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; } } else { __m256i mask = _mm256_lddqu_si256((__m256i*)intrinsic_mask_256_8bit[3]); for (i = 0; i < iHeight2; i += 8){ __m256i M = _mm256_lddqu_si256((__m256i*)&first_line[i]); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)&first_line[i + 2]); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)&first_line[i + 4]); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)&first_line[i + 6]); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; } } /*if (bsx >= 16) { for (i = 0; i < iHeight2; i += 2) { memcpy(dst, first_line + i, bsx * sizeof(pel_t)); dst += i_dst; } } else if (bsx == 8) { for (i = 0; i < iHeight2; i += 8) { __m128i M = _mm_loadu_si128((__m128i*)&first_line[i]); _mm_storel_epi64((__m128i*)(dst), M); dst += i_dst; M = _mm_srli_si128(M, 2); _mm_storel_epi64((__m128i*)(dst), M); dst += i_dst; M = _mm_srli_si128(M, 2); _mm_storel_epi64((__m128i*)(dst), M); dst += i_dst; M = _mm_srli_si128(M, 2); _mm_storel_epi64((__m128i*)(dst), M); dst += i_dst; } } else { for (i = 0; i < iHeight2; i += 8) { __m128i M = _mm_loadu_si128((__m128i*)&first_line[i]); ((int*)(dst))[0] = _mm_cvtsi128_si32(M); dst += i_dst; M = _mm_srli_si128(M, 2); ((int*)(dst))[0] = _mm_cvtsi128_si32(M); dst += i_dst; M = _mm_srli_si128(M, 2); ((int*)(dst))[0] = _mm_cvtsi128_si32(M); dst += i_dst; M = _mm_srli_si128(M, 2); ((int*)(dst))[0] = _mm_cvtsi128_si32(M); dst += i_dst; } }*/ } void intra_pred_ang_y_30_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { ALIGN32(pel_t first_line[64 + 64]); int line_size = bsx + bsy - 1; UNUSED_PARAMETER(dir_mode); int i; __m256i coeff2 = _mm256_set1_epi16(2); __m256i shuffle = _mm256_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); __m256i p00, p10; __m256i p01, p11; __m256i S0, S1, S2; __m256i L0, L1, L2; __m256i H0, H1, H2; src -= 33; for (i = 0; i < line_size - 16; i += 32, src -= 32) { S0 = _mm256_loadu_si256((__m256i*)(src - 1)); S1 = _mm256_loadu_si256((__m256i*)(src)); S2 = _mm256_loadu_si256((__m256i*)(src + 1)); L0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 
0));//35 34 33... L1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 0));//34 33 32... L2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 0)); H0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 1));//20 19 18... H1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 1));//19 18 17... H2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 1)); p00 = _mm256_add_epi16(L0, L1); p10 = _mm256_add_epi16(L1, L2); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, coeff2); p00 = _mm256_srli_epi16(p00, 2);//31...24 23...16 p01 = _mm256_add_epi16(H0, H1); p11 = _mm256_add_epi16(H1, H2); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, coeff2); p01 = _mm256_srli_epi16(p01, 2);//15..8 7...0 p00 = _mm256_packus_epi16(p00, p01);//32...24 15...8 23...16 7...0 p00 = _mm256_permute4x64_epi64(p00, 0x8D); p00 = _mm256_shuffle_epi8(p00, shuffle); _mm256_storeu_si256((__m256i*)&first_line[i], p00); } __m256i mask = _mm256_load_si256((__m256i*)intrinsic_mask_256_8bit[15]); if (i < line_size) { S0 = _mm256_loadu_si256((__m256i*)(src - 1)); S1 = _mm256_loadu_si256((__m256i*)(src)); S2 = _mm256_loadu_si256((__m256i*)(src + 1)); H0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 1));//20 19 18... H1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 1));//19 18 17... H2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 1));//18 p01 = _mm256_add_epi16(H0, H1); p11 = _mm256_add_epi16(H1, H2); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, coeff2); p01 = _mm256_srli_epi16(p01, 2);//15...8..7..0 p01 = _mm256_packus_epi16(p01, p01);//15...8 15...8 7...0 7...0 p01 = _mm256_permute4x64_epi64(p01, 0x0008); p01 = _mm256_shuffle_epi8(p01, shuffle); _mm256_maskstore_epi64((__int64 *)&first_line[i], mask, p01); } __m256i M; if (bsx == 64) { for (i = 0; i < bsy; i += 4){ M = _mm256_lddqu_si256((__m256i*)(first_line + i)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(first_line + i + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(first_line + i + 1)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(first_line + i + 32 + 1)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(first_line + i + 2)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(first_line + i + 32 + 2)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(first_line + i + 3)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(first_line + i + 32 + 3)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; } } else if (bsx == 32) { for (i = 0; i < bsy; i += 4){ M = _mm256_lddqu_si256((__m256i*)(first_line + i)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(first_line + i + 1)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(first_line + i + 2)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(first_line + i + 3)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; } } else if (bsx == 16){ __m256i mask = _mm256_lddqu_si256((__m256i*)intrinsic_mask_256_8bit[15]); for (i = 0; i < bsy; i += 4){ M = _mm256_lddqu_si256((__m256i*)(first_line + i)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(first_line + i + 1)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = 
_mm256_lddqu_si256((__m256i*)(first_line + i + 2)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(first_line + i + 3)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; } } else if (bsx == 8){ __m256i mask = _mm256_lddqu_si256((__m256i*)intrinsic_mask_256_8bit[7]); for (i = 0; i < bsy; i += 4){ M = _mm256_lddqu_si256((__m256i*)(first_line + i)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(first_line + i + 1)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(first_line + i + 2)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(first_line + i + 3)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; } } else { __m256i mask = _mm256_lddqu_si256((__m256i*)intrinsic_mask_256_8bit[3]); for (i = 0; i < bsy; i += 4){ M = _mm256_lddqu_si256((__m256i*)(first_line + i)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(first_line + i + 1)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(first_line + i + 2)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(first_line + i + 3)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; } } /*if (bsx > 16) { for (i = 0; i < bsy; i++) { memcpy(dst, first_line + i, bsx * sizeof(pel_t)); dst += i_dst; } } else if (bsx == 16) { pel_t *dst1 = dst; if (bsy == 4) { __m256i M = _mm256_loadu_si256((__m256i*)&first_line[0]); _mm256_maskstore_epi64((__int64 *)dst1, mask, M); dst1 += i_dst; M = _mm256_loadu_si256((__m256i*)&first_line[1]); _mm256_maskstore_epi64((__int64 *)dst1, mask, M); dst1 += i_dst; M = _mm256_loadu_si256((__m256i*)&first_line[2]); _mm256_maskstore_epi64((__int64 *)dst1, mask, M); dst1 += i_dst; M = _mm256_loadu_si256((__m256i*)&first_line[3]); _mm256_maskstore_epi64((__int64 *)dst1, mask, M); } else { __m256i M = _mm256_loadu_si256((__m256i*)&first_line[0]); _mm256_maskstore_epi64((__int64 *)dst1, mask, M); dst1 += i_dst; M = _mm256_loadu_si256((__m256i*)&first_line[1]); _mm256_maskstore_epi64((__int64 *)dst1, mask, M); dst1 += i_dst; M = _mm256_loadu_si256((__m256i*)&first_line[2]); _mm256_maskstore_epi64((__int64 *)dst1, mask, M); dst1 += i_dst; M = _mm256_loadu_si256((__m256i*)&first_line[3]); _mm256_maskstore_epi64((__int64 *)dst1, mask, M); dst1 += i_dst; M = _mm256_loadu_si256((__m256i*)&first_line[4]); _mm256_maskstore_epi64((__int64 *)dst1, mask, M); dst1 += i_dst; M = _mm256_loadu_si256((__m256i*)&first_line[5]); _mm256_maskstore_epi64((__int64 *)dst1, mask, M); dst1 += i_dst; M = _mm256_loadu_si256((__m256i*)&first_line[6]); _mm256_maskstore_epi64((__int64 *)dst1, mask, M); dst1 += i_dst; M = _mm256_loadu_si256((__m256i*)&first_line[7]); _mm256_maskstore_epi64((__int64 *)dst1, mask, M); dst1 += i_dst; M = _mm256_loadu_si256((__m256i*)&first_line[8]); _mm256_maskstore_epi64((__int64 *)dst1, mask, M); dst1 += i_dst; M = _mm256_loadu_si256((__m256i*)&first_line[9]); _mm256_maskstore_epi64((__int64 *)dst1, mask, M); dst1 += i_dst; M = _mm256_loadu_si256((__m256i*)&first_line[10]); _mm256_maskstore_epi64((__int64 *)dst1, mask, M); dst1 += i_dst; M = _mm256_loadu_si256((__m256i*)&first_line[11]); _mm256_maskstore_epi64((__int64 *)dst1, mask, M); dst1 += i_dst; M = _mm256_loadu_si256((__m256i*)&first_line[12]); _mm256_maskstore_epi64((__int64 *)dst1, mask, M); dst1 += i_dst; M = 
_mm256_loadu_si256((__m256i*)&first_line[13]); _mm256_maskstore_epi64((__int64 *)dst1, mask, M); dst1 += i_dst; M = _mm256_loadu_si256((__m256i*)&first_line[14]); _mm256_maskstore_epi64((__int64 *)dst1, mask, M); dst1 += i_dst; M = _mm256_loadu_si256((__m256i*)&first_line[15]); _mm256_maskstore_epi64((__int64 *)dst1, mask, M); } } else if (bsx == 8) { for (i = 0; i < bsy; i += 8) { __m128i M = _mm_loadu_si128((__m128i*)&first_line[i]); _mm_storel_epi64((__m128i*)dst, M); dst += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst, M); dst += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst, M); dst += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst, M); dst += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst, M); dst += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst, M); dst += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst, M); dst += i_dst; M = _mm_srli_si128(M, 1); _mm_storel_epi64((__m128i*)dst, M); dst += i_dst; } } else { for (i = 0; i < bsy; i += 4) { __m128i M = _mm_loadu_si128((__m128i*)&first_line[i]); ((int*)(dst))[0] = _mm_cvtsi128_si32(M); dst += i_dst; M = _mm_srli_si128(M, 1); ((int*)(dst))[0] = _mm_cvtsi128_si32(M); dst += i_dst; M = _mm_srli_si128(M, 1); ((int*)(dst))[0] = _mm_cvtsi128_si32(M); dst += i_dst; M = _mm_srli_si128(M, 1); ((int*)(dst))[0] = _mm_cvtsi128_si32(M); dst += i_dst; } }*/ } void intra_pred_ang_y_31_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i; UNUSED_PARAMETER(dir_mode); if (bsx >= bsy){ ALIGN32(pel_t dst_tran[MAX_CU_SIZE * MAX_CU_SIZE]); ALIGN32(pel_t src_tran[MAX_CU_SIZE << 3]); for (i = 0; i < (bsy + bsx * 11 / 8 + 3); i++){ src_tran[i] = src[-i]; } intra_pred_ang_x_5_avx(src_tran, dst_tran, bsy, 5, bsy, bsx); for (i = 0; i < bsy; i++){ for (int j = 0; j < bsx; j++){ dst[j + i_dst * i] = dst_tran[i + bsy * j]; } } } else if (bsx == 8){ __m128i coeff0 = _mm_setr_epi16( 5, 1, 7, 1, 1, 3, 3, 1); __m128i coeff1 = _mm_setr_epi16(13, 5, 15, 3, 9, 7, 11, 2); __m128i coeff2 = _mm_setr_epi16(11, 7, 9, 3, 15, 5, 13, 1); __m128i coeff3 = _mm_setr_epi16( 3, 3, 1, 1, 7, 1, 5, 0); __m128i coeff4 = _mm_setr_epi16(16, 8, 16, 4, 16, 8, 16, 2); __m128i coeff5 = _mm_setr_epi16( 1, 2, 1, 4, 1, 2, 1, 8); __m128i L0, L1, L2, L3; __m128i p00, p10, p20, p30; for (i = 0; i < bsy; i++,src--){ L0 = _mm_setr_epi16(src[-1], src[-2], src[-4], src[-5], src[-6], src[ -8], src[ -9], src[-11]); L1 = _mm_setr_epi16(src[-2], src[-3], src[-5], src[-6], src[-7], src[ -9], src[-10], src[-12]); L2 = _mm_setr_epi16(src[-3], src[-4], src[-6], src[-7], src[-8], src[-10], src[-11], src[-13]); L3 = _mm_setr_epi16(src[-4], src[-5], src[-7], src[-8], src[-9], src[-11], src[-12], src[-14]); p00 = _mm_mullo_epi16(L0, coeff0); p10 = _mm_mullo_epi16(L1, coeff1); p20 = _mm_mullo_epi16(L2, coeff2); p30 = _mm_mullo_epi16(L3, coeff3); p00 = _mm_add_epi16(p00, coeff4); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_mullo_epi16(p00, coeff5); p00 = _mm_srli_epi16(p00, 5); p00 = _mm_packus_epi16(p00, p00); _mm_storel_epi64((__m128i*)dst, p00); dst += i_dst; } } else { intra_pred_ang_y_31_sse128(src, dst, i_dst, dir_mode, bsx, bsy); } } void intra_pred_ang_y_32_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { ALIGN32(pel_t first_line[2 * (64 + 64)]); int line_size = (bsy >> 1) + bsx - 1; int i; int aligned_line_size = ((line_size + 63) >> 4) << 4; pel_t *pfirst[2]; UNUSED_PARAMETER(dir_mode); __m256i 
coeff2 = _mm256_set1_epi16(2); __m256i shuffle = _mm256_setr_epi8(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0, 15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0); pfirst[0] = first_line; pfirst[1] = first_line + aligned_line_size; src -= 34; __m256i S0, S1, S2; __m256i L0, L1, L2; __m256i H0, H1, H2; __m256i p00, p01, p10, p11; __m256i mask = _mm256_load_si256((__m256i*)intrinsic_mask_256_8bit[15]); for (i = 0; i < line_size - 8; i += 16, src -= 32) { S0 = _mm256_loadu_si256((__m256i*)(src - 1));//19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 S1 = _mm256_loadu_si256((__m256i*)(src)); //18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 S2 = _mm256_loadu_si256((__m256i*)(src + 1));//17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 L0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 0));//19 18 17 16 15 14 13 12 L1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 0));//18 17 16 15 14 13 12 11 L2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 0));//17 16 15 14 13 12 11 10 H0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 1));//11 10 9 8 7 6 5 4 H1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 1));//10 9 8 7 6 5 4 3 H2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 1));// 9 8 7 6 5 4 3 2 p00 = _mm256_add_epi16(L0, L1); p01 = _mm256_add_epi16(L1, L2); p00 = _mm256_add_epi16(p00, coeff2); p00 = _mm256_add_epi16(p00, p01); p00 = _mm256_srli_epi16(p00, 2);//19...12(31...16) p10 = _mm256_add_epi16(H0, H1); p11 = _mm256_add_epi16(H1, H2); p10 = _mm256_add_epi16(p10, coeff2); p10 = _mm256_add_epi16(p10, p11); p10 = _mm256_srli_epi16(p10, 2);//11...4(15...0) //31...24 15...8 23...16 7...0 p00 = _mm256_packus_epi16(p00, p10); //19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 p00 = _mm256_permute4x64_epi64(p00, 0x8D);//31...16 15..0 //0 2 4 6 8 10 12 14 1 3 5 7 9 11 13 15 16.... p00 = _mm256_shuffle_epi8(p00, shuffle); p10 = _mm256_permute4x64_epi64(p00, 0x0D); p00 = _mm256_permute4x64_epi64(p00, 0x08); _mm256_maskstore_epi64((__int64 *)&pfirst[0][i], mask, p00); _mm256_maskstore_epi64((__int64 *)&pfirst[1][i], mask, p10); } mask = _mm256_load_si256((__m256i*)intrinsic_mask_256_8bit[8]); if (i < line_size) { S0 = _mm256_loadu_si256((__m256i*)(src - 1));//19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 S1 = _mm256_loadu_si256((__m256i*)(src)); //18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 S2 = _mm256_loadu_si256((__m256i*)(src + 1));//17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 H0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 1));//11 10 9 8 7 6 5 4 H1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 1));//10 9 8 7 6 5 4 3 H2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 1));// 9 8 7 6 5 4 3 2 p10 = _mm256_add_epi16(H0, H1); p11 = _mm256_add_epi16(H1, H2); p10 = _mm256_add_epi16(p10, coeff2); p10 = _mm256_add_epi16(p10, p11); p10 = _mm256_srli_epi16(p10, 2); //15...8 15...8 7...0 7...0 p00 = _mm256_packus_epi16(p10, p10); //19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 p00 = _mm256_permute4x64_epi64(p00, 0x8D);//15...0 15...0 //0 2 4 6 8 10 12 14 1 3 5 7 1 3 5 7 8.... 
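/* packus interleaves the two 128-bit lanes, so permute4x64 first
   restores linear byte order; the byte shuffle then de-interleaves
   even/odd pixels into the two half-lines pfirst[0] / pfirst[1]. */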
p00 = _mm256_shuffle_epi8(p00, shuffle); p10 = _mm256_permute4x64_epi64(p00, 0x0D); p00 = _mm256_permute4x64_epi64(p00, 0x08); _mm256_maskstore_epi64((__int64 *)&pfirst[0][i], mask, p00); _mm256_maskstore_epi64((__int64 *)&pfirst[1][i], mask, p10); ; } bsy >>= 1; if (bsx == 64){ for (i = 0; i < bsy; i += 4) { __m256i M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 1)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 1 + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 1)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 1 + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 2)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 2 + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 2)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 2 + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 3)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 3 + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 3)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 3 + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; } } else if (bsx == 32){ for (i = 0; i < bsy; i += 4){ __m256i M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 1)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 1)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 2)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 2)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 3)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 3)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; } } else if (bsx == 16){ __m256i mask = _mm256_loadu_si256((const __m256i*)intrinsic_mask_256_8bit[bsx - 1]); for (i = 0; i < bsy; i += 4) { __m256i M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 1)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 1)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = 
_mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 2)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 2)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 3)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 3)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; } } else if (bsx == 8){ __m256i mask = _mm256_loadu_si256((const __m256i*)intrinsic_mask_256_8bit[bsx - 1]); for (i = 0; i < bsy; i += 4) { __m256i M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 1)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 1)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 2)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 2)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 3)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 3)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; } } else{ __m256i mask = _mm256_loadu_si256((const __m256i*)intrinsic_mask_256_8bit[bsx - 1]); for (i = 0; i < bsy; i += 4) { __m256i M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 1)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 1)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 2)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 2)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] + i + 3)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] + i + 3)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; } } /*if (bsx >= 16 || bsx == 4) { for (i = 0; i < bsy; i++) { memcpy(dst, pfirst[0] + i, bsx * sizeof(pel_t)); memcpy(dst + i_dst, pfirst[1] + i, bsx * sizeof(pel_t)); dst += i_dst2; } } else { if (bsy == 4) {//8x8 __m128i M1 = _mm_loadu_si128((__m128i*)&pfirst[0][0]); __m128i M2 = _mm_loadu_si128((__m128i*)&pfirst[1][0]); _mm_storel_epi64((__m128i*)dst, M1); _mm_storel_epi64((__m128i*)(dst + i_dst), M2); dst += i_dst2; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); _mm_storel_epi64((__m128i*)dst, M1); _mm_storel_epi64((__m128i*)(dst + i_dst), M2); dst += i_dst2; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); _mm_storel_epi64((__m128i*)dst, M1); _mm_storel_epi64((__m128i*)(dst + i_dst), M2); dst += i_dst2; M1 = _mm_srli_si128(M1, 1); M2 = _mm_srli_si128(M2, 1); _mm_storel_epi64((__m128i*)dst, M1); _mm_storel_epi64((__m128i*)(dst + i_dst), M2); } else {//8x32 for (i = 0; i < 16; i = i + 8) { __m128i M1 = 
_mm_loadu_si128((__m128i*)&pfirst[0][i]);
                __m128i M2 = _mm_loadu_si128((__m128i*)&pfirst[1][i]);
                _mm_storel_epi64((__m128i*)dst, M1);
                _mm_storel_epi64((__m128i*)(dst + i_dst), M2);
                dst += i_dst2;
                M1 = _mm_srli_si128(M1, 1);
                M2 = _mm_srli_si128(M2, 1);
                _mm_storel_epi64((__m128i*)dst, M1);
                _mm_storel_epi64((__m128i*)(dst + i_dst), M2);
                dst += i_dst2;
                M1 = _mm_srli_si128(M1, 1);
                M2 = _mm_srli_si128(M2, 1);
                _mm_storel_epi64((__m128i*)dst, M1);
                _mm_storel_epi64((__m128i*)(dst + i_dst), M2);
                dst += i_dst2;
                M1 = _mm_srli_si128(M1, 1);
                M2 = _mm_srli_si128(M2, 1);
                _mm_storel_epi64((__m128i*)dst, M1);
                _mm_storel_epi64((__m128i*)(dst + i_dst), M2);
                dst += i_dst2;
                M1 = _mm_srli_si128(M1, 1);
                M2 = _mm_srli_si128(M2, 1);
                _mm_storel_epi64((__m128i*)dst, M1);
                _mm_storel_epi64((__m128i*)(dst + i_dst), M2);
                dst += i_dst2;
                M1 = _mm_srli_si128(M1, 1);
                M2 = _mm_srli_si128(M2, 1);
                _mm_storel_epi64((__m128i*)dst, M1);
                _mm_storel_epi64((__m128i*)(dst + i_dst), M2);
                dst += i_dst2;
                M1 = _mm_srli_si128(M1, 1);
                M2 = _mm_srli_si128(M2, 1);
                _mm_storel_epi64((__m128i*)dst, M1);
                _mm_storel_epi64((__m128i*)(dst + i_dst), M2);
                dst += i_dst2;
                M1 = _mm_srli_si128(M1, 1);
                M2 = _mm_srli_si128(M2, 1);
                _mm_storel_epi64((__m128i*)dst, M1);
                _mm_storel_epi64((__m128i*)(dst + i_dst), M2);
                dst += i_dst2;
            }
        }
    }*/
}

void intra_pred_ang_xy_13_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy)
{
    int i;
    UNUSED_PARAMETER(dir_mode);
    if (bsy > 4) {
        __m256i coeff2  = _mm256_set1_epi16(2);
        __m256i coeff3  = _mm256_set1_epi16(3);
        __m256i coeff4  = _mm256_set1_epi16(4);
        __m256i coeff5  = _mm256_set1_epi16(5);
        __m256i coeff7  = _mm256_set1_epi16(7);
        __m256i coeff8  = _mm256_set1_epi16(8);
        __m256i coeff9  = _mm256_set1_epi16(9);
        __m256i coeff11 = _mm256_set1_epi16(11);
        __m256i coeff13 = _mm256_set1_epi16(13);
        __m256i coeff15 = _mm256_set1_epi16(15);
        __m256i coeff16 = _mm256_set1_epi16(16);
        ALIGN32(pel_t first_line[(64 + 16) << 3]);
        int line_size = bsx + (bsy >> 3) - 1;
        int left_size = line_size - bsx;
        int aligned_line_size = ((line_size + 15) >> 4) << 4;
        pel_t *pfirst[8];
        pfirst[0] = first_line;
        pfirst[1] = pfirst[0] + aligned_line_size;
        pfirst[2] = pfirst[1] + aligned_line_size;
        pfirst[3] = pfirst[2] + aligned_line_size;
        pfirst[4] = pfirst[3] + aligned_line_size;
        pfirst[5] = pfirst[4] + aligned_line_size;
        pfirst[6] = pfirst[5] + aligned_line_size;
        pfirst[7] = pfirst[6] + aligned_line_size;
        src -= bsy - 8;
        for (i = 0; i < left_size; i++, src += 8) { /* left_size is small, so there is no need to use intrinsics here */
            pfirst[0][i] = (pel_t)((src[ 6] + (src[ 7] << 1) + src[ 8] + 2) >> 2);
            pfirst[1][i] = (pel_t)((src[ 5] + (src[ 6] << 1) + src[ 7] + 2) >> 2);
            pfirst[2][i] = (pel_t)((src[ 4] + (src[ 5] << 1) + src[ 6] + 2) >> 2);
            pfirst[3][i] = (pel_t)((src[ 3] + (src[ 4] << 1) + src[ 5] + 2) >> 2);
            pfirst[4][i] = (pel_t)((src[ 2] + (src[ 3] << 1) + src[ 4] + 2) >> 2);
            pfirst[5][i] = (pel_t)((src[ 1] + (src[ 2] << 1) + src[ 3] + 2) >> 2);
            pfirst[6][i] = (pel_t)((src[ 0] + (src[ 1] << 1) + src[ 2] + 2) >> 2);
            pfirst[7][i] = (pel_t)((src[-1] + (src[ 0] << 1) + src[ 1] + 2) >> 2);
        }
        __m256i p00, p10, p20, p30;
        __m256i p01, p11, p21, p31;
        __m256i S0, S1, S2, S3;
        __m256i L0, L1, L2, L3;
        __m256i H0, H1, H2, H3;
        for (; i < line_size - 16; i += 32, src += 32) {
            S0 = _mm256_loadu_si256((__m256i*)(src + 2));
            S1 = _mm256_loadu_si256((__m256i*)(src + 1));
            S2 = _mm256_loadu_si256((__m256i*)(src));
            S3 = _mm256_loadu_si256((__m256i*)(src - 1));
            L0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 0));
            L1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 0));
            L2 =
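/* xy_13 (sketch): eight filtered sub-lines feed each group of eight
   output rows, which are emitted by walking the sub-lines backwards
   (pfirst[k] - i). The 4-tap weight sets used below, e.g.
   (7,15,9,1) >> 5 for pfirst[0] and (3,7,5,1) >> 4 for pfirst[1],
   interpolate the eight fractional offsets of this direction. */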
_mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 0)); L3 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S3, 0)); H0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 1)); H1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 1)); H2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 1)); H3 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S3, 1)); p00 = _mm256_mullo_epi16(L0, coeff7); p10 = _mm256_mullo_epi16(L1, coeff15); p20 = _mm256_mullo_epi16(L2, coeff9); p30 = _mm256_add_epi16(L3, coeff16); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 5); p01 = _mm256_mullo_epi16(H0, coeff7); p11 = _mm256_mullo_epi16(H1, coeff15); p21 = _mm256_mullo_epi16(H2, coeff9); p31 = _mm256_add_epi16(H3, coeff16); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, p21); p01 = _mm256_add_epi16(p01, p31); p01 = _mm256_srli_epi16(p01, 5); p00 = _mm256_packus_epi16(p00, p01); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_storeu_si256((__m256i*)&pfirst[0][i], p00); p00 = _mm256_mullo_epi16(L0, coeff3); p10 = _mm256_mullo_epi16(L1, coeff7); p20 = _mm256_mullo_epi16(L2, coeff5); p30 = _mm256_add_epi16(L3, coeff8); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 4); p01 = _mm256_mullo_epi16(H0, coeff3); p11 = _mm256_mullo_epi16(H1, coeff7); p21 = _mm256_mullo_epi16(H2, coeff5); p31 = _mm256_add_epi16(H3, coeff8); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, p21); p01 = _mm256_add_epi16(p01, p31); p01 = _mm256_srli_epi16(p01, 4); p00 = _mm256_packus_epi16(p00, p01); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_storeu_si256((__m256i*)&pfirst[1][i], p00); p00 = _mm256_mullo_epi16(L0, coeff5); p10 = _mm256_mullo_epi16(L1, coeff13); p20 = _mm256_mullo_epi16(L2, coeff11); p30 = _mm256_mullo_epi16(L3, coeff3); p00 = _mm256_add_epi16(p00, coeff16); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 5); p01 = _mm256_mullo_epi16(H0, coeff5); p11 = _mm256_mullo_epi16(H1, coeff13); p21 = _mm256_mullo_epi16(H2, coeff11); p31 = _mm256_mullo_epi16(H3, coeff3); p01 = _mm256_add_epi16(p01, coeff16); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, p21); p01 = _mm256_add_epi16(p01, p31); p01 = _mm256_srli_epi16(p01, 5); p00 = _mm256_packus_epi16(p00, p01); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_storeu_si256((__m256i*)&pfirst[2][i], p00); p00 = _mm256_add_epi16(L0, L3); p10 = _mm256_add_epi16(L1, L2); p10 = _mm256_mullo_epi16(p10, coeff3); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, coeff4); p00 = _mm256_srli_epi16(p00, 3); p01 = _mm256_add_epi16(H0, H3); p11 = _mm256_add_epi16(H1, H2); p11 = _mm256_mullo_epi16(p11, coeff3); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, coeff4); p01 = _mm256_srli_epi16(p01, 3); p00 = _mm256_packus_epi16(p00, p01); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_storeu_si256((__m256i*)&pfirst[3][i], p00); p00 = _mm256_mullo_epi16(L0, coeff3); p10 = _mm256_mullo_epi16(L1, coeff11); p20 = _mm256_mullo_epi16(L2, coeff13); p30 = _mm256_mullo_epi16(L3, coeff5); p00 = _mm256_add_epi16(p00, coeff16); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 5); p01 = _mm256_mullo_epi16(H0, coeff3); p11 = _mm256_mullo_epi16(H1, coeff11); p21 = _mm256_mullo_epi16(H2, coeff13); p31 = 
_mm256_mullo_epi16(H3, coeff5); p01 = _mm256_add_epi16(p01, coeff16); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, p21); p01 = _mm256_add_epi16(p01, p31); p01 = _mm256_srli_epi16(p01, 5); p00 = _mm256_packus_epi16(p00, p01); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_storeu_si256((__m256i*)&pfirst[4][i], p00); p10 = _mm256_mullo_epi16(L1, coeff5); p20 = _mm256_mullo_epi16(L2, coeff7); p30 = _mm256_mullo_epi16(L3, coeff3); p00 = _mm256_add_epi16(L0, coeff8); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 4); p11 = _mm256_mullo_epi16(H1, coeff5); p21 = _mm256_mullo_epi16(H2, coeff7); p31 = _mm256_mullo_epi16(H3, coeff3); p01 = _mm256_add_epi16(H0, coeff8); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, p21); p01 = _mm256_add_epi16(p01, p31); p01 = _mm256_srli_epi16(p01, 4); p00 = _mm256_packus_epi16(p00, p01); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_storeu_si256((__m256i*)&pfirst[5][i], p00); p10 = _mm256_mullo_epi16(L1, coeff9); p20 = _mm256_mullo_epi16(L2, coeff15); p30 = _mm256_mullo_epi16(L3, coeff7); p00 = _mm256_add_epi16(L0, coeff16); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 5); p11 = _mm256_mullo_epi16(H1, coeff9); p21 = _mm256_mullo_epi16(H2, coeff15); p31 = _mm256_mullo_epi16(H3, coeff7); p01 = _mm256_add_epi16(H0, coeff16); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, p21); p01 = _mm256_add_epi16(p01, p31); p01 = _mm256_srli_epi16(p01, 5); p00 = _mm256_packus_epi16(p00, p01); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_storeu_si256((__m256i*)&pfirst[6][i], p00); p10 = _mm256_mullo_epi16(L2, coeff2); p00 = _mm256_add_epi16(L1, L3); p00 = _mm256_add_epi16(p00, coeff2); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_srli_epi16(p00, 2); p11 = _mm256_mullo_epi16(H2, coeff2); p01 = _mm256_add_epi16(H1, H3); p01 = _mm256_add_epi16(p01, coeff2); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_srli_epi16(p01, 2); p00 = _mm256_packus_epi16(p00, p01); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_storeu_si256((__m256i*)&pfirst[7][i], p00); } __m256i mask = _mm256_load_si256((__m256i*)intrinsic_mask_256_8bit[bsx - 1]); if (i < line_size) { S0 = _mm256_loadu_si256((__m256i*)(src + 2)); S1 = _mm256_loadu_si256((__m256i*)(src + 1)); S2 = _mm256_loadu_si256((__m256i*)(src)); S3 = _mm256_loadu_si256((__m256i*)(src - 1)); L0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 0)); L1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 0)); L2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 0)); L3 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S3, 0)); p00 = _mm256_mullo_epi16(L0, coeff7); p10 = _mm256_mullo_epi16(L1, coeff15); p20 = _mm256_mullo_epi16(L2, coeff9); p30 = _mm256_add_epi16(L3, coeff16); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 5); p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_maskstore_epi32((int*)&pfirst[0][i], mask, p00); p00 = _mm256_mullo_epi16(L0, coeff3); p10 = _mm256_mullo_epi16(L1, coeff7); p20 = _mm256_mullo_epi16(L2, coeff5); p30 = _mm256_add_epi16(L3, coeff8); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 4); p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); 
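/* line tail: the same 4-tap filters are applied to the low 128 bits
   only, and masked 32-bit stores keep the writes inside each
   sub-line buffer. */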
_mm256_maskstore_epi32((int*)&pfirst[1][i], mask, p00); p00 = _mm256_mullo_epi16(L0, coeff5); p10 = _mm256_mullo_epi16(L1, coeff13); p20 = _mm256_mullo_epi16(L2, coeff11); p30 = _mm256_mullo_epi16(L3, coeff3); p00 = _mm256_add_epi16(p00, coeff16); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 5); p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_maskstore_epi32((int*)&pfirst[2][i], mask, p00); p00 = _mm256_add_epi16(L0, L3); p10 = _mm256_add_epi16(L1, L2); p10 = _mm256_mullo_epi16(p10, coeff3); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, coeff4); p00 = _mm256_srli_epi16(p00, 3); p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_maskstore_epi32((int*)&pfirst[3][i], mask, p00); p00 = _mm256_mullo_epi16(L0, coeff3); p10 = _mm256_mullo_epi16(L1, coeff11); p20 = _mm256_mullo_epi16(L2, coeff13); p30 = _mm256_mullo_epi16(L3, coeff5); p00 = _mm256_add_epi16(p00, coeff16); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 5); p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_maskstore_epi32((int*)&pfirst[4][i], mask, p00); p10 = _mm256_mullo_epi16(L1, coeff5); p20 = _mm256_mullo_epi16(L2, coeff7); p30 = _mm256_mullo_epi16(L3, coeff3); p00 = _mm256_add_epi16(L0, coeff8); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 4); p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_maskstore_epi32((int*)&pfirst[5][i], mask, p00); p10 = _mm256_mullo_epi16(L1, coeff9); p20 = _mm256_mullo_epi16(L2, coeff15); p30 = _mm256_mullo_epi16(L3, coeff7); p00 = _mm256_add_epi16(L0, coeff16); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 5); p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_maskstore_epi32((int*)&pfirst[6][i], mask, p00); p10 = _mm256_mullo_epi16(L2, coeff2); p00 = _mm256_add_epi16(L1, L3); p00 = _mm256_add_epi16(p00, coeff2); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_srli_epi16(p00, 2); p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_maskstore_epi32((int*)&pfirst[7][i], mask, p00); } pfirst[0] += left_size; pfirst[1] += left_size; pfirst[2] += left_size; pfirst[3] += left_size; pfirst[4] += left_size; pfirst[5] += left_size; pfirst[6] += left_size; pfirst[7] += left_size; bsy >>= 3; __m256i M; if (bsx == 64){ for (i = 0; i < bsy; i++){ M = _mm256_lddqu_si256((__m256i*)(pfirst[0] - i)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[0] - i + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] - i)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[1] - i + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[2] - i)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[2] - i + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[3] - i)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[3] - i + 32)); _mm256_storeu_si256((__m256i*)(dst + 
32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[4] - i)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[4] - i + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[5] - i)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[5] - i + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[6] - i)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[6] - i + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[7] - i)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[7] - i + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; } } else if (bsx == 32){ for (i = 0; i < bsy; i++){ M = _mm256_lddqu_si256((__m256i*)(pfirst[0] - i)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] - i)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[2] - i)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[3] - i)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[4] - i)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[5] - i)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[6] - i)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[7] - i)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; } } else if (bsx == 16) { __m256i mask = _mm256_lddqu_si256((__m256i*)intrinsic_mask_256_8bit[15]); for (i = 0; i < bsy; i++){ M = _mm256_lddqu_si256((__m256i*)(pfirst[0] - i)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] - i)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[2] - i)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[3] - i)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[4] - i)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[5] - i)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[6] - i)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[7] - i)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; } } else if (bsx == 8) { __m256i mask = _mm256_lddqu_si256((__m256i*)intrinsic_mask_256_8bit[7]); for (i = 0; i < bsy; i++){ M = _mm256_lddqu_si256((__m256i*)(pfirst[0] - i)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] - i)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[2] - i)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[3] - i)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[4] - i)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[5] - i)); _mm256_maskstore_epi64((__int64 *)dst, 
mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[6] - i)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[7] - i)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; } } else { __m256i mask = _mm256_lddqu_si256((__m256i*)intrinsic_mask_256_8bit[3]); for (i = 0; i < bsy; i++){ M = _mm256_lddqu_si256((__m256i*)(pfirst[0] - i)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] - i)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[2] - i)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[3] - i)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[4] - i)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[5] - i)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[6] - i)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[7] - i)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; } } /*for (i = 0; i < bsy; i++) { memcpy(dst, pfirst[0] - i, bsx * sizeof(pel_t)); dst += i_dst; memcpy(dst, pfirst[1] - i, bsx * sizeof(pel_t)); dst += i_dst; memcpy(dst, pfirst[2] - i, bsx * sizeof(pel_t)); dst += i_dst; memcpy(dst, pfirst[3] - i, bsx * sizeof(pel_t)); dst += i_dst; memcpy(dst, pfirst[4] - i, bsx * sizeof(pel_t)); dst += i_dst; memcpy(dst, pfirst[5] - i, bsx * sizeof(pel_t)); dst += i_dst; memcpy(dst, pfirst[6] - i, bsx * sizeof(pel_t)); dst += i_dst; memcpy(dst, pfirst[7] - i, bsx * sizeof(pel_t)); dst += i_dst; }*/ } else { intra_pred_ang_xy_13_sse128(src, dst, i_dst, dir_mode, bsx, bsy); return; } } void intra_pred_ang_xy_14_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i; UNUSED_PARAMETER(dir_mode); __m256i coeff2 = _mm256_set1_epi16(2); __m256i coeff3 = _mm256_set1_epi16(3); __m256i coeff4 = _mm256_set1_epi16(4); __m256i coeff5 = _mm256_set1_epi16(5); __m256i coeff7 = _mm256_set1_epi16(7); __m256i coeff8 = _mm256_set1_epi16(8); if (bsy != 4) { ALIGN32(pel_t first_line[4 * (64 + 32)]); int line_size = bsx + bsy / 4 - 1; int left_size = line_size - bsx; int aligned_line_size = ((line_size + 31) >> 4) << 4; pel_t *pfirst[4]; __m256i shuffle = _mm256_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15); __m256i index = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7); pel_t *pSrc1 = src; pfirst[0] = first_line; pfirst[1] = first_line + aligned_line_size; pfirst[2] = first_line + aligned_line_size * 2; pfirst[3] = first_line + aligned_line_size * 3; src -= bsy - 4; __m256i p00, p01, p10, p11; __m256i p20, p30, p21, p31; __m256i S0, S1, S2, S3; __m256i L0, L1, L2, L3; __m256i H0, H1, H2, H3; __m256i mask0 = _mm256_set_epi64x(0, 0, 0, -1); __m256i mask1 = _mm256_set_epi64x(0, 0, -1, 0); __m256i mask2 = _mm256_set_epi64x(0, -1, 0, 0); __m256i mask3 = _mm256_set_epi64x(-1, 0, 0, 0); for (i = 0; i < left_size - 1; i += 8, src += 32) { S0 = _mm256_loadu_si256((__m256i*)(src - 1));//0 1 2 3 4 5 6 7 8...15 S1 = _mm256_loadu_si256((__m256i*)(src)); S2 = _mm256_loadu_si256((__m256i*)(src + 1)); L0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 0));//0...15 L1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 0)); L2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 0)); H0 
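/* xy_14 (sketch): four sub-lines, one per row of each 4-row group.
   This left-part loop filters 32 source bytes per pass and scatters
   the de-interleaved results into pfirst[0..3] with masked 64-bit
   stores (mask3..mask0 select one quadrant each). */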
= _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 1));//16...31
            H1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 1));
            H2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 1));
            p00 = _mm256_add_epi16(L0, L1);
            p01 = _mm256_add_epi16(L1, L2);
            p10 = _mm256_add_epi16(H0, H1);
            p11 = _mm256_add_epi16(H1, H2);
            p00 = _mm256_add_epi16(p00, coeff2);
            p10 = _mm256_add_epi16(p10, coeff2);
            p00 = _mm256_add_epi16(p00, p01);
            p10 = _mm256_add_epi16(p10, p11);
            p00 = _mm256_srli_epi16(p00, 2);//0...7 8...15
            p10 = _mm256_srli_epi16(p10, 2);//16...23 24...31
            p00 = _mm256_packus_epi16(p00, p10);//0...7 16...23 8...15 24...31
            p00 = _mm256_permute4x64_epi64(p00, 0x00D8);
            //0 4 8 12 1 5 9 13 2 6 10 14 3 7 11 15 16 20 24 28 17 21...
            p10 = _mm256_shuffle_epi8(p00, shuffle);
            //0 4 8 12 16 20 24 28 1 5 9 13 17 21 25 29
            p10 = _mm256_permutevar8x32_epi32(p10, index);
            _mm256_maskstore_epi64(((__int64 *)(pfirst[0] + i - 24)), mask3, p10);
            _mm256_maskstore_epi64(((__int64 *)(pfirst[1] + i - 16)), mask2, p10);
            _mm256_maskstore_epi64(((__int64 *)(pfirst[2] + i - 8 )), mask1, p10);
            _mm256_maskstore_epi64(((__int64 *)(pfirst[3] + i    )), mask0, p10);
        }
        if (i < left_size) {
            /* only a few elements remain, so the 128-bit (SSE) path is faster than AVX here */
            __m128i shuffle1 = _mm_setr_epi8(0, 4, 1, 5, 2, 6, 3, 7, 0, 4, 1, 5, 2, 6, 3, 7);
            __m128i coeff2 = _mm_set1_epi16(2);
            __m128i zero = _mm_setzero_si128();
            __m128i S0 = _mm_loadu_si128((__m128i*)(src - 1));
            __m128i S2 = _mm_loadu_si128((__m128i*)(src + 1));
            __m128i S1 = _mm_loadu_si128((__m128i*)(src));
            __m128i L0 = _mm_unpacklo_epi8(S0, zero);//0 1 2 3 4 5 6 7
            __m128i L1 = _mm_unpacklo_epi8(S1, zero);
            __m128i L2 = _mm_unpacklo_epi8(S2, zero);
            __m128i p00 = _mm_add_epi16(L0, L1);
            __m128i p01 = _mm_add_epi16(L1, L2);
            p00 = _mm_add_epi16(p00, coeff2);
            p00 = _mm_add_epi16(p00, p01);
            p00 = _mm_srli_epi16(p00, 2);
            p00 = _mm_packus_epi16(p00, p00);//0 1 2 3 4 5 6 7
            p00 = _mm_shuffle_epi8(p00, shuffle1);//0 4 1 5 2 6 3 7
            ((int*)&pfirst[0][i])[0] = _mm_extract_epi16(p00, 3);
            ((int*)&pfirst[1][i])[0] = _mm_extract_epi16(p00, 2);
            ((int*)&pfirst[2][i])[0] = _mm_extract_epi16(p00, 1);
            ((int*)&pfirst[3][i])[0] = _mm_extract_epi16(p00, 0);
        }
        src = pSrc1;
        for (i = left_size; i < line_size - 16; i += 32, src += 32) {
            S0 = _mm256_loadu_si256((__m256i*)(src - 1));
            S1 = _mm256_loadu_si256((__m256i*)(src));
            S2 = _mm256_loadu_si256((__m256i*)(src + 1));
            S3 = _mm256_loadu_si256((__m256i*)(src + 2));
            L0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 0));
            L1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 0));
            L2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 0));
            L3 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S3, 0));
            H0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 1));
            H1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 1));
            H2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 1));
            H3 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S3, 1));
            p00 = _mm256_mullo_epi16(L0, coeff3);
            p10 = _mm256_mullo_epi16(L1, coeff7);
            p20 = _mm256_mullo_epi16(L2, coeff5);
            p30 = _mm256_add_epi16(L3, coeff8);
            p00 = _mm256_add_epi16(p00, p30);
            p00 = _mm256_add_epi16(p00, p10);
            p00 = _mm256_add_epi16(p00, p20);
            p00 = _mm256_srli_epi16(p00, 4);
            p01 = _mm256_mullo_epi16(H0, coeff3);
            p11 = _mm256_mullo_epi16(H1, coeff7);
            p21 = _mm256_mullo_epi16(H2, coeff5);
            p31 = _mm256_add_epi16(H3, coeff8);
            p01 = _mm256_add_epi16(p01, p31);
            p01 = _mm256_add_epi16(p01, p11);
            p01 = _mm256_add_epi16(p01, p21);
            p01 = _mm256_srli_epi16(p01, 4);
            p00 = _mm256_packus_epi16(p00, p01);
            p00 = _mm256_permute4x64_epi64(p00, 0x00D8);
            _mm256_storeu_si256((__m256i*)&pfirst[2][i], p00);
            p00
= _mm256_add_epi16(L1, L2); p00 = _mm256_mullo_epi16(p00, coeff3); p10 = _mm256_add_epi16(L0, L3); p10 = _mm256_add_epi16(p10, coeff4); p00 = _mm256_add_epi16(p10, p00); p00 = _mm256_srli_epi16(p00, 3); p01 = _mm256_add_epi16(H1, H2); p01 = _mm256_mullo_epi16(p01, coeff3); p11 = _mm256_add_epi16(H0, H3); p11 = _mm256_add_epi16(p11, coeff4); p01 = _mm256_add_epi16(p11, p01); p01 = _mm256_srli_epi16(p01, 3); p00 = _mm256_packus_epi16(p00, p01); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_storeu_si256((__m256i*)&pfirst[1][i], p00); p10 = _mm256_mullo_epi16(L1, coeff5); p20 = _mm256_mullo_epi16(L2, coeff7); p30 = _mm256_mullo_epi16(L3, coeff3); p00 = _mm256_add_epi16(L0, coeff8); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 4); p11 = _mm256_mullo_epi16(H1, coeff5); p21 = _mm256_mullo_epi16(H2, coeff7); p31 = _mm256_mullo_epi16(H3, coeff3); p01 = _mm256_add_epi16(H0, coeff8); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, p21); p01 = _mm256_add_epi16(p01, p31); p01 = _mm256_srli_epi16(p01, 4); p00 = _mm256_packus_epi16(p00, p01); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_storeu_si256((__m256i*)&pfirst[0][i], p00); p00 = _mm256_add_epi16(L0, L1); p10 = _mm256_add_epi16(L1, L2); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, coeff2); p00 = _mm256_srli_epi16(p00, 2); p01 = _mm256_add_epi16(H0, H1); p11 = _mm256_add_epi16(H1, H2); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, coeff2); p01 = _mm256_srli_epi16(p01, 2); p00 = _mm256_packus_epi16(p00, p01); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_storeu_si256((__m256i*)&pfirst[3][i], p00); } if (i < line_size) { __m256i mask = _mm256_load_si256((__m256i*)intrinsic_mask_256_8bit[15]); S0 = _mm256_loadu_si256((__m256i*)(src - 1)); S1 = _mm256_loadu_si256((__m256i*)(src)); S2 = _mm256_loadu_si256((__m256i*)(src + 1)); S3 = _mm256_loadu_si256((__m256i*)(src + 2)); L0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 0)); L1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 0)); L2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 0)); L3 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S3, 0)); p00 = _mm256_mullo_epi16(L0, coeff3); p10 = _mm256_mullo_epi16(L1, coeff7); p20 = _mm256_mullo_epi16(L2, coeff5); p30 = _mm256_add_epi16(L3, coeff8); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_srli_epi16(p00, 4); p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 0x0008); _mm256_maskstore_epi64((__int64 *)&pfirst[2][i], mask, p00); p00 = _mm256_add_epi16(L1, L2); p00 = _mm256_mullo_epi16(p00, coeff3); p10 = _mm256_add_epi16(L0, L3); p10 = _mm256_add_epi16(p10, coeff4); p00 = _mm256_add_epi16(p10, p00); p00 = _mm256_srli_epi16(p00, 3); p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 0x0008); _mm256_maskstore_epi64((__int64 *)&pfirst[1][i], mask, p00); p10 = _mm256_mullo_epi16(L1, coeff5); p20 = _mm256_mullo_epi16(L2, coeff7); p30 = _mm256_mullo_epi16(L3, coeff3); p00 = _mm256_add_epi16(L0, coeff8); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 4); p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 0x0008); _mm256_maskstore_epi64((__int64 *)&pfirst[0][i], mask, p00); p00 = _mm256_add_epi16(L0, L1); p10 = _mm256_add_epi16(L1, L2); p00 = _mm256_add_epi16(p00, p10); p00 = 
_mm256_add_epi16(p00, coeff2); p00 = _mm256_srli_epi16(p00, 2); p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 0x0008); _mm256_maskstore_epi64((__int64 *)&pfirst[3][i], mask, p00); } pfirst[0] += left_size; pfirst[1] += left_size; pfirst[2] += left_size; pfirst[3] += left_size; bsy >>= 2; if (bsx == 64){ for (i = 0; i < bsy; i++){ __m256i M = _mm256_lddqu_si256((__m256i*)(pfirst[0] - i)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[0] - i + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] - i)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[1] - i + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[2] - i)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[2] - i + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[3] - i)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[3] - i + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; } } else if (bsx == 32) { for (i = 0; i < bsy; i++){ __m256i M = _mm256_lddqu_si256((__m256i*)(pfirst[0] - i)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] - i)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[2] - i)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[3] - i)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; } } else if (bsx == 16) { __m256i mask = _mm256_lddqu_si256((__m256i*)intrinsic_mask_256_8bit[15]); for (i = 0; i < bsy; i++){ __m256i M = _mm256_lddqu_si256((__m256i*)(pfirst[0] - i)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] - i)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[2] - i)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[3] - i)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; } } else if (bsx == 8) { __m256i mask = _mm256_lddqu_si256((__m256i*)intrinsic_mask_256_8bit[7]); for (i = 0; i < bsy; i++){ __m256i M = _mm256_lddqu_si256((__m256i*)(pfirst[0] - i)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] - i)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[2] - i)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[3] - i)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; } } else { __m256i mask = _mm256_lddqu_si256((__m256i*)intrinsic_mask_256_8bit[3]); for (i = 0; i < bsy; i++){ __m256i M = _mm256_lddqu_si256((__m256i*)(pfirst[0] - i)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] - i)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[2] - i)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[3] - i)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; } } /*for (i = 0; i < bsy; i++) { memcpy(dst, pfirst[0] - i, bsx * sizeof(pel_t)); dst += i_dst; memcpy(dst, pfirst[1] - i, bsx * sizeof(pel_t)); dst 
+= i_dst; memcpy(dst, pfirst[2] - i, bsx * sizeof(pel_t)); dst += i_dst; memcpy(dst, pfirst[3] - i, bsx * sizeof(pel_t)); dst += i_dst; }*/ } else { if (bsx == 16) { __m256i mask = _mm256_load_si256((__m256i*)intrinsic_mask_256_8bit[15]); pel_t *dst2 = dst + i_dst; pel_t *dst3 = dst2 + i_dst; pel_t *dst4 = dst3 + i_dst; __m256i p00, p10, p20, p30; __m256i L0, L1, L2, L3; __m256i S0 = _mm256_loadu_si256((__m256i*)(src - 1)); __m256i S3 = _mm256_loadu_si256((__m256i*)(src + 2)); __m256i S1 = _mm256_loadu_si256((__m256i*)(src)); __m256i S2 = _mm256_loadu_si256((__m256i*)(src + 1)); L0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 0)); L1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 0)); L2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 0)); L3 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S3, 0)); p00 = _mm256_mullo_epi16(L0, coeff3); p10 = _mm256_mullo_epi16(L1, coeff7); p20 = _mm256_mullo_epi16(L2, coeff5); p30 = _mm256_add_epi16(L3, coeff8); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_srli_epi16(p00, 4); p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 0x0008); _mm256_maskstore_epi64((__int64 *)dst3, mask, p00); p00 = _mm256_add_epi16(L1, L2); p00 = _mm256_mullo_epi16(p00, coeff3); p10 = _mm256_add_epi16(L0, L3); p10 = _mm256_add_epi16(p10, coeff4); p00 = _mm256_add_epi16(p10, p00); p00 = _mm256_srli_epi16(p00, 3); p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 0x0008); _mm256_maskstore_epi64((__int64 *)dst2, mask, p00); p10 = _mm256_mullo_epi16(L1, coeff5); p20 = _mm256_mullo_epi16(L2, coeff7); p30 = _mm256_mullo_epi16(L3, coeff3); p00 = _mm256_add_epi16(L0, coeff8); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_srli_epi16(p00, 4); p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 0x0008); _mm256_maskstore_epi64((__int64 *)dst, mask, p00); p00 = _mm256_add_epi16(L0, L1); p10 = _mm256_add_epi16(L1, L2); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, coeff2); p00 = _mm256_srli_epi16(p00, 2); p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 0x0008); _mm256_maskstore_epi64((__int64 *)dst4, mask, p00); } else {//4x4 pel_t *dst2 = dst + i_dst; pel_t *dst3 = dst2 + i_dst; pel_t *dst4 = dst3 + i_dst; __m128i p00, p10, p20, p30; __m128i coeff2 = _mm_set1_epi16(2); __m128i coeff3 = _mm_set1_epi16(3); __m128i coeff4 = _mm_set1_epi16(4); __m128i coeff5 = _mm_set1_epi16(5); __m128i coeff7 = _mm_set1_epi16(7); __m128i coeff8 = _mm_set1_epi16(8); __m128i zero = _mm_setzero_si128(); __m128i S0 = _mm_loadu_si128((__m128i*)(src - 1)); __m128i S3 = _mm_loadu_si128((__m128i*)(src + 2)); __m128i S1 = _mm_loadu_si128((__m128i*)(src)); __m128i S2 = _mm_loadu_si128((__m128i*)(src + 1)); __m128i L0 = _mm_unpacklo_epi8(S0, zero); __m128i L1 = _mm_unpacklo_epi8(S1, zero); __m128i L2 = _mm_unpacklo_epi8(S2, zero); __m128i L3 = _mm_unpacklo_epi8(S3, zero); p00 = _mm_mullo_epi16(L0, coeff3); p10 = _mm_mullo_epi16(L1, coeff7); p20 = _mm_mullo_epi16(L2, coeff5); p30 = _mm_add_epi16(L3, coeff8); p00 = _mm_add_epi16(p00, p30); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_srli_epi16(p00, 4); p00 = _mm_packus_epi16(p00, p00); ((int*)dst3)[0] = _mm_cvtsi128_si32(p00); p00 = _mm_add_epi16(L1, L2); p00 = _mm_mullo_epi16(p00, coeff3); p10 = _mm_add_epi16(L0, L3); p10 = _mm_add_epi16(p10, coeff4); p00 = _mm_add_epi16(p10, p00); p00 = 
_mm_srli_epi16(p00, 3); p00 = _mm_packus_epi16(p00, p00); ((int*)dst2)[0] = _mm_cvtsi128_si32(p00); p10 = _mm_mullo_epi16(L1, coeff5); p20 = _mm_mullo_epi16(L2, coeff7); p30 = _mm_mullo_epi16(L3, coeff3); p00 = _mm_add_epi16(L0, coeff8); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, p20); p00 = _mm_add_epi16(p00, p30); p00 = _mm_srli_epi16(p00, 4); p00 = _mm_packus_epi16(p00, p00); ((int*)dst)[0] = _mm_cvtsi128_si32(p00); p00 = _mm_add_epi16(L0, L1); p10 = _mm_add_epi16(L1, L2); p00 = _mm_add_epi16(p00, p10); p00 = _mm_add_epi16(p00, coeff2); p00 = _mm_srli_epi16(p00, 2); p00 = _mm_packus_epi16(p00, p00); ((int*)dst4)[0] = _mm_cvtsi128_si32(p00); } } } void intra_pred_ang_xy_16_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { ALIGN32(pel_t first_line[2 * (64 + 48)]); int line_size = bsx + bsy / 2 - 1; int left_size = line_size - bsx; int aligned_line_size = ((line_size + 31) >> 4) << 4; pel_t *pfirst[2]; UNUSED_PARAMETER(dir_mode); __m256i coeff2 = _mm256_set1_epi16(2); __m256i coeff3 = _mm256_set1_epi16(3); __m256i coeff4 = _mm256_set1_epi16(4); __m256i shuffle = _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); int i; pel_t *pSrc1; pfirst[0] = first_line; pfirst[1] = first_line + aligned_line_size; src -= bsy - 2; pSrc1 = src; __m256i p00, p01, p10, p11; __m256i S0, S1, S2, S3; __m256i L0, L1, L2, L3; __m256i H0, H1, H2, H3; __m256i mask1 = _mm256_load_si256((__m256i*)intrinsic_mask_256_8bit[15]); for (i = 0; i < left_size - 8; i += 16, src += 32) { S0 = _mm256_loadu_si256((__m256i*)(src - 1));// S2 = _mm256_loadu_si256((__m256i*)(src + 1));// S1 = _mm256_loadu_si256((__m256i*)(src));// L0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 0)); L1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 0)); L2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 0)); H0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 1)); H1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 1)); H2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 1)); p00 = _mm256_add_epi16(L0, L1); p01 = _mm256_add_epi16(L1, L2); p10 = _mm256_add_epi16(H0, H1); p11 = _mm256_add_epi16(H1, H2); p00 = _mm256_add_epi16(p00, coeff2); p10 = _mm256_add_epi16(p10, coeff2); p00 = _mm256_add_epi16(p00, p01); p10 = _mm256_add_epi16(p10, p11); p00 = _mm256_srli_epi16(p00, 2);//0 1 2 3 4 5 6 7....15 p10 = _mm256_srli_epi16(p10, 2);//16 17 18....31 //0...7 16...23 8...15 24...31 p00 = _mm256_packus_epi16(p00, p10); p00 = _mm256_permute4x64_epi64(p00, 0x00D8);//31...16 15..0 //0 1 2 3 p00 = _mm256_shuffle_epi8(p00, shuffle); p10 = _mm256_permute4x64_epi64(p00, 0x08);//0 2 p00 = _mm256_permute4x64_epi64(p00, 0x0D);//1 3 _mm256_maskstore_epi64((__int64 *)&pfirst[0][i], mask1, p00); _mm256_maskstore_epi64((__int64 *)&pfirst[1][i], mask1, p10); } __m256i mask2 = _mm256_load_si256((__m256i*)intrinsic_mask_256_8bit[7]); if (i < left_size) { S0 = _mm256_loadu_si256((__m256i*)(src - 1)); S2 = _mm256_loadu_si256((__m256i*)(src + 1)); S1 = _mm256_loadu_si256((__m256i*)(src)); L0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 0)); L1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 0)); L2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 0)); p00 = _mm256_add_epi16(L0, L1); p01 = _mm256_add_epi16(L1, L2); p00 = _mm256_add_epi16(p00, coeff2); p00 = _mm256_add_epi16(p00, p01); p00 = _mm256_srli_epi16(p00, 2); //0...7 0...7 8...15 8...15 p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 
0x0008);//0...15 0...15 p01 = _mm256_shuffle_epi8(p00, shuffle);//0 2 4 6 7 8 10 12 14 1 3 5 7 9 11 13 15 p10 = _mm256_permute4x64_epi64(p01, 0x01); _mm256_maskstore_epi64((__int64 *)&pfirst[0][i], mask2, p10); _mm256_maskstore_epi64((__int64 *)&pfirst[1][i], mask2, p01); } src = pSrc1 + left_size + left_size; for (i = left_size; i < line_size - 16; i += 32, src += 32) { S0 = _mm256_loadu_si256((__m256i*)(src - 1)); S3 = _mm256_loadu_si256((__m256i*)(src + 2)); S1 = _mm256_loadu_si256((__m256i*)(src)); S2 = _mm256_loadu_si256((__m256i*)(src + 1)); L0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 0)); L1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 0)); L2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 0)); L3 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S3, 0)); H0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 1)); H1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 1)); H2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 1)); H3 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S3, 1)); p00 = _mm256_add_epi16(L1, L2); p01 = _mm256_add_epi16(L0, L3); p00 = _mm256_mullo_epi16(p00, coeff3); p00 = _mm256_add_epi16(p00, coeff4); p00 = _mm256_add_epi16(p00, p01); p00 = _mm256_srli_epi16(p00, 3); p10 = _mm256_add_epi16(H1, H2); p11 = _mm256_add_epi16(H0, H3); p10 = _mm256_mullo_epi16(p10, coeff3); p10 = _mm256_add_epi16(p10, coeff4); p10 = _mm256_add_epi16(p10, p11); p10 = _mm256_srli_epi16(p10, 3); p00 = _mm256_packus_epi16(p00, p10); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_storeu_si256((__m256i*)&pfirst[0][i], p00); p00 = _mm256_add_epi16(L0, L1); p01 = _mm256_add_epi16(L1, L2); p10 = _mm256_add_epi16(H0, H1); p11 = _mm256_add_epi16(H1, H2); p00 = _mm256_add_epi16(p00, coeff2); p10 = _mm256_add_epi16(p10, coeff2); p00 = _mm256_add_epi16(p00, p01); p10 = _mm256_add_epi16(p10, p11); p00 = _mm256_srli_epi16(p00, 2); p10 = _mm256_srli_epi16(p10, 2); p00 = _mm256_packus_epi16(p00, p10); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_storeu_si256((__m256i*)&pfirst[1][i], p00); } if (i < line_size) { S0 = _mm256_loadu_si256((__m256i*)(src - 1)); S3 = _mm256_loadu_si256((__m256i*)(src + 2)); S1 = _mm256_loadu_si256((__m256i*)(src)); S2 = _mm256_loadu_si256((__m256i*)(src + 1)); L0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 0)); L1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 0)); L2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 0)); L3 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S3, 0)); p00 = _mm256_add_epi16(L1, L2); p01 = _mm256_add_epi16(L0, L3); p00 = _mm256_mullo_epi16(p00, coeff3); p00 = _mm256_add_epi16(p00, coeff4); p00 = _mm256_add_epi16(p00, p01); p00 = _mm256_srli_epi16(p00, 3); p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 0x0008); _mm256_maskstore_epi64((__int64 *)&pfirst[0][i], mask1, p00); p00 = _mm256_add_epi16(L0, L1); p01 = _mm256_add_epi16(L1, L2); p00 = _mm256_add_epi16(p00, coeff2); p00 = _mm256_add_epi16(p00, p01); p00 = _mm256_srli_epi16(p00, 2); p00 = _mm256_packus_epi16(p00, p00); p00 = _mm256_permute4x64_epi64(p00, 0x0008); _mm256_maskstore_epi64((__int64 *)&pfirst[1][i], mask1, p00); } pfirst[0] += left_size; pfirst[1] += left_size; bsy >>= 1; if (bsx == 64){ for (i = 0; i < bsy; i += 4) { __m256i M = _mm256_lddqu_si256((__m256i*)(pfirst[0] - i)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[0] - i + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] - i)); 
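/* row replay: each 4-row group alternates pfirst[0]/pfirst[1] and
   steps the read pointer back by one pixel per row pair
   (-i, -i - 1, -i - 2, ...), sliding along the filtered line. */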
_mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[1] - i + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] - i - 1)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[0] - i - 1 + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] - i - 1)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[1] - i - 1 + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] - i - 2)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[0] - i - 2 + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] - i - 2)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[1] - i - 2 + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] - i - 3)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[0] - i - 3 + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] - i - 3)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst[1] - i - 3 + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; } } else if (bsx == 32){ for (i = 0; i < bsy; i += 4){ __m256i M = _mm256_lddqu_si256((__m256i*)(pfirst[0] - i)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] - i)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] - i - 1)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] - i - 1)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] - i - 2)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] - i - 2)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] - i - 3)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] - i - 3)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; } } else if (bsx == 16){ __m256i mask = _mm256_loadu_si256((const __m256i*)intrinsic_mask_256_8bit[bsx - 1]); for (i = 0; i < bsy; i += 4) { __m256i M = _mm256_lddqu_si256((__m256i*)(pfirst[0] - i)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] - i)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] - i - 1)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] - i - 1)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] - i - 2)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] - i - 2)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] - i - 3)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] - i - 3)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; } } else if (bsx == 8){ __m256i mask = _mm256_loadu_si256((const __m256i*)intrinsic_mask_256_8bit[bsx - 1]); 
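/* narrow blocks (bsx == 8): a full 32-byte load of the sub-line is
   performed, but the mask limits the store to the first bsx pixels of
   each row; this matches the commented-out scalar fallback below,
   e.g. memcpy(dst, pfirst[0] - i, bsx * sizeof(pel_t)). */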
for (i = 0; i < bsy; i += 4) { __m256i M = _mm256_lddqu_si256((__m256i*)(pfirst[0] - i)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] - i)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] - i - 1)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] - i - 1)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] - i - 2)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] - i - 2)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] - i - 3)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] - i - 3)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; } } else{ __m256i mask = _mm256_loadu_si256((const __m256i*)intrinsic_mask_256_8bit[bsx - 1]); for (i = 0; i < bsy; i += 4) { __m256i M = _mm256_lddqu_si256((__m256i*)(pfirst[0] - i)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] - i)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] - i - 1)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] - i - 1)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] - i - 2)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] - i - 2)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[0] - i - 3)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; M = _mm256_lddqu_si256((__m256i*)(pfirst[1] - i - 3)); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; } } /*switch (bsx) { case 4: for (i = 0; i < bsy; i++) { CP32(dst, pfirst[0] - i); CP32(dst + i_dst, pfirst[1] - i); dst += (i_dst << 1); } break; case 8: for (i = 0; i < bsy; i++) { CP64(dst, pfirst[0] - i); CP64(dst + i_dst, pfirst[1] - i); dst += (i_dst << 1); } break; default: for (i = 0; i < bsy; i++) { memcpy(dst, pfirst[0] - i, bsx * sizeof(pel_t)); memcpy(dst + i_dst, pfirst[1] - i, bsx * sizeof(pel_t)); dst += (i_dst << 1); } break; }*/ } void intra_pred_ang_xy_18_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { ALIGN32(pel_t first_line[64 + 64]); int line_size = bsx + bsy - 1; int i; pel_t *pfirst = first_line + bsy - 1; UNUSED_PARAMETER(dir_mode); __m256i coeff2 = _mm256_set1_epi16(2); src -= bsy - 1; __m256i S0, S1, S2; __m256i L0, L1, L2; __m256i H0, H1, H2; __m256i sum1, sum2, sum3, sum4; for (i = 0; i < line_size - 16; i += 32, src += 32) { S0 = _mm256_loadu_si256((__m256i*)(src - 1)); S2 = _mm256_loadu_si256((__m256i*)(src + 1)); S1 = _mm256_loadu_si256((__m256i*)(src)); L0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 0)); L1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 0)); L2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 0)); H0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 1)); H1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 1)); H2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 1)); sum1 = _mm256_add_epi16(L0, L1); sum2 = _mm256_add_epi16(L1, L2); sum3 = _mm256_add_epi16(H0, H1); sum4 = _mm256_add_epi16(H1, H2); sum1 = _mm256_add_epi16(sum1, 
sum2); sum3 = _mm256_add_epi16(sum3, sum4); sum1 = _mm256_add_epi16(sum1, coeff2); sum3 = _mm256_add_epi16(sum3, coeff2); sum1 = _mm256_srli_epi16(sum1, 2); sum3 = _mm256_srli_epi16(sum3, 2); sum1 = _mm256_packus_epi16(sum1, sum3); sum1 = _mm256_permute4x64_epi64(sum1, 0x00D8); _mm256_storeu_si256((__m256i*)&first_line[i], sum1); } if (i < line_size) { __m256i mask = _mm256_load_si256((__m256i*)intrinsic_mask_256_8bit[15]); S0 = _mm256_loadu_si256((__m256i*)(src - 1)); S2 = _mm256_loadu_si256((__m256i*)(src + 1)); S1 = _mm256_loadu_si256((__m256i*)(src)); L0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 0)); L1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 0)); L2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 0)); sum1 = _mm256_add_epi16(L0, L1); sum2 = _mm256_add_epi16(L1, L2); sum1 = _mm256_add_epi16(sum1, sum2); sum1 = _mm256_add_epi16(sum1, coeff2); sum1 = _mm256_srli_epi16(sum1, 2); sum1 = _mm256_packus_epi16(sum1, sum1); sum1 = _mm256_permute4x64_epi64(sum1, 0x00D8); _mm256_maskstore_epi64((__int64 *)&first_line[i], mask, sum1); } __m256i M; if (bsx == 64) { for (i = 0; i < bsy; i += 4){ M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; pfirst--; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; pfirst--; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; pfirst--; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; pfirst--; } } else if (bsx == 32) { for (i = 0; i < bsy; i += 4){ M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; pfirst--; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; pfirst--; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; pfirst--; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; pfirst--; } } else if (bsx == 16){ __m256i mask = _mm256_lddqu_si256((__m256i*)intrinsic_mask_256_8bit[15]); for (i = 0; i < bsy; i += 4){ M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; pfirst--; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; pfirst--; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; pfirst--; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; pfirst--; } } else if (bsx == 8) { __m256i mask = _mm256_lddqu_si256((__m256i*)intrinsic_mask_256_8bit[7]); for (i = 0; i < bsy; i += 4){ M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; pfirst--; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; pfirst--; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; pfirst--; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_maskstore_epi64((__int64 *)dst, mask, 
M); dst += i_dst; pfirst--; } } else { __m256i mask = _mm256_lddqu_si256((__m256i*)intrinsic_mask_256_8bit[3]); for (i = 0; i < bsy; i += 4){ M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; pfirst--; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; pfirst--; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; pfirst--; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; pfirst--; } } /*switch (bsx) { case 4: for (i = 0; i < bsy; i++) { CP32(dst, pfirst--); dst += i_dst; } break; case 8: for (i = 0; i < bsy; i++) { CP64(dst, pfirst--); dst += i_dst; } break; default: for (i = 0; i < bsy; i++) { memcpy(dst, pfirst--, bsx * sizeof(pel_t)); dst += i_dst; } break; break; }*/ } void intra_pred_ang_xy_20_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { ALIGN32(pel_t first_line[64 + 128]); int left_size = (bsy - 1) * 2 + 1; int top_size = bsx - 1; int line_size = left_size + top_size; int i; pel_t *pfirst = first_line + left_size - 1; __m256i coeff2 = _mm256_set1_epi16(2); __m256i coeff3 = _mm256_set1_epi16(3); __m256i coeff4 = _mm256_set1_epi16(4); __m256i shuffle = _mm256_setr_epi8(0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15); pel_t *pSrc1 = src; UNUSED_PARAMETER(dir_mode); src -= bsy; __m256i p00, p01, p10, p11; __m256i p20, p21, p30, p31; __m256i S0, S1, S2, S3; __m256i L0, L1, L2, L3; __m256i H0, H1, H2, H3; for (i = 0; i < left_size - 32; i += 64, src += 32) { S0 = _mm256_loadu_si256((__m256i*)(src - 1));//0...7 8...15 S1 = _mm256_loadu_si256((__m256i*)(src)); S2 = _mm256_loadu_si256((__m256i*)(src + 1)); S3 = _mm256_loadu_si256((__m256i*)(src + 2)); L0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 0));//0...7 L1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 0)); L2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 0)); L3 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S3, 0)); H0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 1)); H1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 1)); H2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 1)); H3 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S3, 1)); p00 = _mm256_add_epi16(L1, L2); p01 = _mm256_add_epi16(L0, L3); p00 = _mm256_mullo_epi16(p00, coeff3); p00 = _mm256_add_epi16(p00, coeff4); p00 = _mm256_add_epi16(p00, p01); p00 = _mm256_srli_epi16(p00, 3);//0...15 p10 = _mm256_add_epi16(H1, H2); p11 = _mm256_add_epi16(H0, H3); p10 = _mm256_mullo_epi16(p10, coeff3); p10 = _mm256_add_epi16(p10, coeff4); p10 = _mm256_add_epi16(p10, p11); p10 = _mm256_srli_epi16(p10, 3);//16..31 p20 = _mm256_add_epi16(L1, L2); p21 = _mm256_add_epi16(L2, L3); p20 = _mm256_add_epi16(p20, coeff2); p20 = _mm256_add_epi16(p20, p21); p20 = _mm256_srli_epi16(p20, 2);//0...15 p30 = _mm256_add_epi16(H1, H2); p31 = _mm256_add_epi16(H2, H3); p30 = _mm256_add_epi16(p30, coeff2); p30 = _mm256_add_epi16(p30, p31); p30 = _mm256_srli_epi16(p30, 2);//16...31 //00...07 10...17 08...015 18...115 p00 = _mm256_packus_epi16(p00, p20); p10 = _mm256_packus_epi16(p10, p30); p00 = _mm256_shuffle_epi8(p00, shuffle); p10 = _mm256_shuffle_epi8(p10, shuffle); _mm256_storeu_si256((__m256i*)&first_line[i], p00); _mm256_storeu_si256((__m256i*)&first_line[i + 32], p10); } if (i < left_size) { S0 = _mm256_loadu_si256((__m256i*)(src - 1));//0...7 8...15 S1 = 
_mm256_loadu_si256((__m256i*)(src)); S2 = _mm256_loadu_si256((__m256i*)(src + 1)); S3 = _mm256_loadu_si256((__m256i*)(src + 2)); L0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 0));//0...7 L1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 0)); L2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 0)); L3 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S3, 0)); p00 = _mm256_add_epi16(L1, L2); p00 = _mm256_mullo_epi16(p00, coeff3); p01 = _mm256_add_epi16(L0, L3); p00 = _mm256_add_epi16(p00, coeff4); p00 = _mm256_add_epi16(p00, p01); p00 = _mm256_srli_epi16(p00, 3);//0...15 p20 = _mm256_add_epi16(L1, L2); p21 = _mm256_add_epi16(L2, L3); p20 = _mm256_add_epi16(p20, coeff2); p20 = _mm256_add_epi16(p20, p21); p20 = _mm256_srli_epi16(p20, 2);//0...15 p00 = _mm256_packus_epi16(p00, p20); p00 = _mm256_shuffle_epi8(p00, shuffle); _mm256_storeu_si256((__m256i*)&first_line[i], p00); } src = pSrc1; __m256i sum1, sum2, sum3, sum4; for (i = left_size; i < line_size - 16; i += 32, src += 32) { S2 = _mm256_loadu_si256((__m256i*)(src + 1)); S0 = _mm256_loadu_si256((__m256i*)(src - 1)); S1 = _mm256_loadu_si256((__m256i*)(src)); L0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 0)); L1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 0)); L2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 0)); H0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 1)); H1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 1)); H2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 1)); sum1 = _mm256_add_epi16(L0, L1); sum2 = _mm256_add_epi16(L1, L2); sum3 = _mm256_add_epi16(H0, H1); sum4 = _mm256_add_epi16(H1, H2); sum1 = _mm256_add_epi16(sum1, sum2); sum3 = _mm256_add_epi16(sum3, sum4); sum1 = _mm256_add_epi16(sum1, coeff2); sum3 = _mm256_add_epi16(sum3, coeff2); sum1 = _mm256_srli_epi16(sum1, 2); sum3 = _mm256_srli_epi16(sum3, 2); sum1 = _mm256_packus_epi16(sum1, sum3); sum1 = _mm256_permute4x64_epi64(sum1, 0x00D8); _mm256_storeu_si256((__m256i*)&first_line[i], sum1); } __m256i mask = _mm256_load_si256((__m256i*)intrinsic_mask_256_8bit[15]); if (i < line_size) { S0 = _mm256_loadu_si256((__m256i*)(src - 1)); S2 = _mm256_loadu_si256((__m256i*)(src + 1)); S1 = _mm256_loadu_si256((__m256i*)(src)); L0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 0)); L1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 0)); L2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 0)); sum1 = _mm256_add_epi16(L0, L1); sum2 = _mm256_add_epi16(L1, L2); sum1 = _mm256_add_epi16(sum1, sum2); sum1 = _mm256_add_epi16(sum1, coeff2); sum1 = _mm256_srli_epi16(sum1, 2); sum1 = _mm256_packus_epi16(sum1, sum1); sum1 = _mm256_permute4x64_epi64(sum1, 0x00D8); _mm256_maskstore_epi64((__int64 *)&first_line[i], mask, sum1); } if (bsx == 64){ for (i = 0; i < bsy; i += 8) { __m256i M = _mm256_lddqu_si256((__m256i*)(pfirst)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; pfirst -= 2; M = _mm256_lddqu_si256((__m256i*)(pfirst)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; pfirst -= 2; M = _mm256_lddqu_si256((__m256i*)(pfirst)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; pfirst -= 2; M = _mm256_lddqu_si256((__m256i*)(pfirst)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst + 32)); 
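/* upper 32 bytes of the 64-pixel row; in this mode pfirst steps back 2 pixels per output row */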
_mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; pfirst -= 2; M = _mm256_lddqu_si256((__m256i*)(pfirst)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; pfirst -= 2; M = _mm256_lddqu_si256((__m256i*)(pfirst)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; pfirst -= 2; M = _mm256_lddqu_si256((__m256i*)(pfirst)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; pfirst -= 2; M = _mm256_lddqu_si256((__m256i*)(pfirst)); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; pfirst -= 2; } } else if (bsx == 32){ for (i = 0; i < bsy; i += 8) { __m256i M = _mm256_lddqu_si256((__m256i*)(pfirst)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; pfirst -= 2; M = _mm256_lddqu_si256((__m256i*)(pfirst)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; pfirst -= 2; M = _mm256_lddqu_si256((__m256i*)(pfirst)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; pfirst -= 2; M = _mm256_lddqu_si256((__m256i*)(pfirst)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; pfirst -= 2; M = _mm256_lddqu_si256((__m256i*)(pfirst)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; pfirst -= 2; M = _mm256_lddqu_si256((__m256i*)(pfirst)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; pfirst -= 2; M = _mm256_lddqu_si256((__m256i*)(pfirst)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; pfirst -= 2; M = _mm256_lddqu_si256((__m256i*)(pfirst)); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; pfirst -= 2; } } else if (bsx == 16){ __m256i mask = _mm256_loadu_si256((const __m256i*)intrinsic_mask_256_8bit[bsx - 1]); for (i = 0; i < bsy; i += 4) { __m256i M = _mm256_lddqu_si256((__m256i*)(pfirst)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; pfirst -= 2; M = _mm256_lddqu_si256((__m256i*)(pfirst)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; pfirst -= 2; M = _mm256_lddqu_si256((__m256i*)(pfirst)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; pfirst -= 2; M = _mm256_lddqu_si256((__m256i*)(pfirst)); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; pfirst -= 2; } } else if (bsx == 8){ __m256i mask = _mm256_loadu_si256((const __m256i*)intrinsic_mask_256_8bit[bsx - 1]); for (i = 0; i < bsy; i += 8) { __m256i M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; pfirst -= 2; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; pfirst -= 2; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; pfirst -= 2; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; pfirst -= 2; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; pfirst -= 2; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; pfirst -= 2; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; pfirst -= 2; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; pfirst -= 
2; } } else{ __m256i mask = _mm256_loadu_si256((const __m256i*)intrinsic_mask_256_8bit[bsx - 1]); for (i = 0; i < bsy; i += 4) { __m256i M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; pfirst -= 2; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; pfirst -= 2; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; pfirst -= 2; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_maskstore_epi32((int*)dst, mask, M); dst += i_dst; pfirst -= 2; } } /*for (i = 0; i < bsy; i++) { memcpy(dst, pfirst, bsx * sizeof(pel_t)); pfirst -= 2; dst += i_dst; }*/ } void intra_pred_ang_xy_22_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i; UNUSED_PARAMETER(dir_mode); if (bsx != 4) { src -= bsy; ALIGN32(pel_t first_line[64 + 256]); int left_size = (bsy - 1) * 4 + 3; int top_size = bsx - 3; int line_size = left_size + top_size; pel_t *pfirst = first_line + left_size - 3; pel_t *pSrc1 = src; __m256i coeff2 = _mm256_set1_epi16(2); __m256i coeff3 = _mm256_set1_epi16(3); __m256i coeff4 = _mm256_set1_epi16(4); __m256i coeff5 = _mm256_set1_epi16(5); __m256i coeff7 = _mm256_set1_epi16(7); __m256i coeff8 = _mm256_set1_epi16(8); __m256i shuffle = _mm256_setr_epi8(0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15); __m256i p00, p10, p20, p30; __m256i p01, p11, p21, p31; __m256i M1, M2, M3, M4, M5, M6, M7, M8; __m256i S0, S1, S2, S3; __m256i L0, L1, L2, L3; __m256i H0, H1, H2, H3; for (i = 0; i < line_size - 64; i += 128, src += 32) { S0 = _mm256_loadu_si256((__m256i*)(src - 1)); S3 = _mm256_loadu_si256((__m256i*)(src + 2)); S1 = _mm256_loadu_si256((__m256i*)(src)); S2 = _mm256_loadu_si256((__m256i*)(src + 1)); L0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 0)); L1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 0)); L2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 0)); L3 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S3, 0)); H0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 1)); H1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 1)); H2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 1)); H3 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S3, 1)); p00 = _mm256_mullo_epi16(L0, coeff3); p10 = _mm256_mullo_epi16(L1, coeff7); p20 = _mm256_mullo_epi16(L2, coeff5); p30 = _mm256_add_epi16(L3, coeff8); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); M1 = _mm256_srli_epi16(p00, 4);//0...15 p01 = _mm256_mullo_epi16(H0, coeff3); p11 = _mm256_mullo_epi16(H1, coeff7); p21 = _mm256_mullo_epi16(H2, coeff5); p31 = _mm256_add_epi16(H3, coeff8); p01 = _mm256_add_epi16(p01, p31); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, p21); M2 = _mm256_srli_epi16(p01, 4);//16...31 p00 = _mm256_add_epi16(L1, L2); p00 = _mm256_mullo_epi16(p00, coeff3); p10 = _mm256_add_epi16(L0, L3); p10 = _mm256_add_epi16(p10, coeff4); p00 = _mm256_add_epi16(p10, p00); M3 = _mm256_srli_epi16(p00, 3); p01 = _mm256_add_epi16(H1, H2); p01 = _mm256_mullo_epi16(p01, coeff3); p11 = _mm256_add_epi16(H0, H3); p11 = _mm256_add_epi16(p11, coeff4); p01 = _mm256_add_epi16(p11, p01); M4 = _mm256_srli_epi16(p01, 3); p10 = _mm256_mullo_epi16(L1, coeff5); p20 = _mm256_mullo_epi16(L2, coeff7); p30 = _mm256_mullo_epi16(L3, coeff3); p00 = _mm256_add_epi16(L0, coeff8); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = 
_mm256_add_epi16(p00, p30); M5 = _mm256_srli_epi16(p00, 4); p11 = _mm256_mullo_epi16(H1, coeff5); p21 = _mm256_mullo_epi16(H2, coeff7); p31 = _mm256_mullo_epi16(H3, coeff3); p01 = _mm256_add_epi16(H0, coeff8); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, p21); p01 = _mm256_add_epi16(p01, p31); M6 = _mm256_srli_epi16(p01, 4); p00 = _mm256_add_epi16(L1, L2); p10 = _mm256_add_epi16(L2, L3); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, coeff2); M7 = _mm256_srli_epi16(p00, 2); p01 = _mm256_add_epi16(H1, H2); p11 = _mm256_add_epi16(H2, H3); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, coeff2); M8 = _mm256_srli_epi16(p01, 2); M1 = _mm256_packus_epi16(M1, M3);//00...08 10...18 M5 = _mm256_packus_epi16(M5, M7); M1 = _mm256_shuffle_epi8(M1, shuffle);//00 10 01 11 02 12... M5 = _mm256_shuffle_epi8(M5, shuffle); M2 = _mm256_packus_epi16(M2, M4); M6 = _mm256_packus_epi16(M6, M8); M2 = _mm256_shuffle_epi8(M2, shuffle); M6 = _mm256_shuffle_epi8(M6, shuffle); M1 = _mm256_permute4x64_epi64(M1, 0x00D8); M5 = _mm256_permute4x64_epi64(M5, 0x00D8); M2 = _mm256_permute4x64_epi64(M2, 0x00D8); M6 = _mm256_permute4x64_epi64(M6, 0x00D8); M3 = _mm256_unpacklo_epi16(M1, M5); M7 = _mm256_unpackhi_epi16(M1, M5); M4 = _mm256_unpacklo_epi16(M2, M6); M8 = _mm256_unpackhi_epi16(M2, M6); _mm256_storeu_si256((__m256i*)&first_line[i], M3); _mm256_storeu_si256((__m256i*)&first_line[32 + i], M7); _mm256_storeu_si256((__m256i*)&first_line[64 + i], M4); _mm256_storeu_si256((__m256i*)&first_line[96 + i], M8); } if (i < left_size) { S0 = _mm256_loadu_si256((__m256i*)(src - 1)); S3 = _mm256_loadu_si256((__m256i*)(src + 2)); S1 = _mm256_loadu_si256((__m256i*)(src)); S2 = _mm256_loadu_si256((__m256i*)(src + 1)); L0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 0)); L1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 0)); L2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 0)); L3 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S3, 0)); p00 = _mm256_mullo_epi16(L0, coeff3); p10 = _mm256_mullo_epi16(L1, coeff7); p20 = _mm256_mullo_epi16(L2, coeff5); p30 = _mm256_add_epi16(L3, coeff8); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); M1 = _mm256_srli_epi16(p00, 4); p00 = _mm256_add_epi16(L1, L2); p00 = _mm256_mullo_epi16(p00, coeff3); p10 = _mm256_add_epi16(L0, L3); p10 = _mm256_add_epi16(p10, coeff4); p00 = _mm256_add_epi16(p10, p00); M3 = _mm256_srli_epi16(p00, 3); p10 = _mm256_mullo_epi16(L1, coeff5); p20 = _mm256_mullo_epi16(L2, coeff7); p30 = _mm256_mullo_epi16(L3, coeff3); p00 = _mm256_add_epi16(L0, coeff8); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); M5 = _mm256_srli_epi16(p00, 4); p10 = _mm256_add_epi16(L2, L3); p00 = _mm256_add_epi16(L1, L2); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, coeff2); M7 = _mm256_srli_epi16(p00, 2); M1 = _mm256_packus_epi16(M1, M3); M5 = _mm256_packus_epi16(M5, M7); M1 = _mm256_shuffle_epi8(M1, shuffle); M5 = _mm256_shuffle_epi8(M5, shuffle); M1 = _mm256_permute4x64_epi64(M1, 0x00D8); M5 = _mm256_permute4x64_epi64(M5, 0x00D8); M3 = _mm256_unpacklo_epi16(M1, M5); M7 = _mm256_unpackhi_epi16(M1, M5); _mm256_store_si256((__m256i*)&first_line[i], M3); _mm256_store_si256((__m256i*)&first_line[32 + i], M7); } src = pSrc1 + bsy; __m256i sum1, sum2, sum3, sum4; for (i = left_size; i < line_size - 16; i += 32, src += 32) { S0 = _mm256_loadu_si256((__m256i*)(src - 1)); S2 = _mm256_loadu_si256((__m256i*)(src + 1)); S1 = 
_mm256_loadu_si256((__m256i*)(src)); L0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 0)); L1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 0)); L2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 0)); H0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 1)); H1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 1)); H2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 1)); sum1 = _mm256_add_epi16(L0, L1); sum2 = _mm256_add_epi16(L1, L2); sum3 = _mm256_add_epi16(H0, H1); sum4 = _mm256_add_epi16(H1, H2); sum1 = _mm256_add_epi16(sum1, sum2); sum3 = _mm256_add_epi16(sum3, sum4); sum1 = _mm256_add_epi16(sum1, coeff2); sum3 = _mm256_add_epi16(sum3, coeff2); sum1 = _mm256_srli_epi16(sum1, 2); sum3 = _mm256_srli_epi16(sum3, 2); sum1 = _mm256_packus_epi16(sum1, sum3); sum1 = _mm256_permute4x64_epi64(sum1, 0x00D8); _mm256_storeu_si256((__m256i*)&first_line[i], sum1); } if (i < line_size) { __m256i mask = _mm256_load_si256((__m256i*)intrinsic_mask_256_8bit[15]); S0 = _mm256_loadu_si256((__m256i*)(src - 1)); S2 = _mm256_loadu_si256((__m256i*)(src + 1)); S1 = _mm256_loadu_si256((__m256i*)(src)); L0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 0)); L1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 0)); L2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 0)); sum1 = _mm256_add_epi16(L0, L1); sum2 = _mm256_add_epi16(L1, L2); sum1 = _mm256_add_epi16(sum1, sum2); sum1 = _mm256_add_epi16(sum1, coeff2); sum1 = _mm256_srli_epi16(sum1, 2); sum1 = _mm256_packus_epi16(sum1, sum1); sum1 = _mm256_permute4x64_epi64(sum1, 0x00D8); _mm256_maskstore_epi64((__int64 *)&first_line[i], mask, sum1); } __m256i M; if (bsx == 64) { for (i = 0; i < bsy; i += 4){ M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; pfirst -= 4; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; pfirst -= 4; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; pfirst -= 4; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; pfirst -= 4; } } else if (bsx == 32) { for (i = 0; i < bsy; i += 4){ M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; pfirst -= 4; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; pfirst -= 4; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; pfirst -= 4; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; pfirst -= 4; } } else if (bsx == 16){ __m256i mask = _mm256_lddqu_si256((__m256i*)intrinsic_mask_256_8bit[15]); for (i = 0; i < bsy; i += 4){ M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; pfirst -= 4; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; pfirst -= 4; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; pfirst -= 4; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_maskstore_epi64((__int64 *)dst, 
mask, M); dst += i_dst; pfirst -= 4; } } else { __m256i mask = _mm256_lddqu_si256((__m256i*)intrinsic_mask_256_8bit[7]); for (i = 0; i < bsy; i += 4){ M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; pfirst -= 4; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; pfirst -= 4; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; pfirst -= 4; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; pfirst -= 4; } } /* switch (bsx) { case 8: while (bsy--) { CP64(dst, pfirst); dst += i_dst; pfirst -= 4; } break; case 16: case 32: case 64: while (bsy--) { memcpy(dst, pfirst, bsx * sizeof(pel_t)); dst += i_dst; pfirst -= 4; } break; default: assert(0); break; }*/ } else {//4x4 4x16 for (i = 0; i < bsy; i++, src--) { dst[0] = (pel_t)((src[-2] * 3 + src[-1] * 7 + src[0] * 5 + src[1] + 8) >> 4); dst[1] = (pel_t)((src[-2] + (src[-1] + src[0]) * 3 + src[1] + 4) >> 3); dst[2] = (pel_t)((src[-2] + src[-1] * 5 + src[0] * 7 + src[1] * 3 + 8) >> 4); dst[3] = (pel_t)(( src[-1] + src[0] * 2 + src[1] + 2) >> 2); dst += i_dst; } } } void intra_pred_ang_xy_23_avx(pel_t *src, pel_t *dst, int i_dst, int dir_mode, int bsx, int bsy) { int i; UNUSED_PARAMETER(dir_mode); if (bsx > 8) { ALIGN32(pel_t first_line[64 + 512]); int left_size = (bsy << 3) - 1; int top_size = bsx - 7; int line_size = left_size + top_size; pel_t *pfirst = first_line + left_size - 7; pel_t *pfirst1 = first_line; pel_t *src_org = src; src -= bsy; __m256i coeff0 = _mm256_setr_epi16(7, 3, 5, 1, 3, 1, 1, 0, 7, 3, 5, 1, 3, 1, 1, 0); __m256i coeff1 = _mm256_setr_epi16(15, 7, 13, 3, 11, 5, 9, 1, 15, 7, 13, 3, 11, 5, 9, 1); __m256i coeff2 = _mm256_setr_epi16(9, 5, 11, 3, 13, 7, 15, 2, 9, 5, 11, 3, 13, 7, 15, 2); __m256i coeff3 = _mm256_setr_epi16(1, 1, 3, 1, 5, 3, 7, 1, 1, 1, 3, 1, 5, 3, 7, 1); __m256i coeff4 = _mm256_setr_epi16(16, 8, 16, 4, 16, 8, 16, 2, 16, 8, 16, 4, 16, 8, 16, 2); __m256i coeff5 = _mm256_setr_epi16(1, 2, 1, 4, 1, 2, 1, 8, 1, 2, 1, 4, 1, 2, 1, 8); __m256i p00, p10, p20, p30; __m256i p01, p11, p21, p31; __m256i res1, res2; __m256i L0, L1, L2, L3; __m256i H0, H1, H2; if (bsy == 4){ L0 = _mm256_setr_epi16(src[-1], src[-1], src[-1], src[-1], src[-1], src[-1], src[-1], src[-1], src[1], src[1], src[1], src[1], src[1], src[1], src[1], src[1]);//-1 3 L1 = _mm256_setr_epi16(src[0], src[0], src[0], src[0], src[0], src[0], src[0], src[0], src[2], src[2], src[2], src[2], src[2], src[2], src[2], src[2]);//0 4 L2 = _mm256_setr_epi16(src[1], src[1], src[1], src[1], src[1], src[1], src[1], src[1], src[3], src[3], src[3], src[3], src[3], src[3], src[3], src[3]);//1 5 L3 = _mm256_setr_epi16(src[2], src[2], src[2], src[2], src[2], src[2], src[2], src[2], src[4], src[4], src[4], src[4], src[4], src[4], src[4], src[4]);//2 6 src += 4; for (i = 0; i < left_size + 1; i += 32) { p00 = _mm256_mullo_epi16(L0, coeff0);//-1 p10 = _mm256_mullo_epi16(L1, coeff1);//0 p20 = _mm256_mullo_epi16(L2, coeff2);//1 p30 = _mm256_mullo_epi16(L3, coeff3);//2 p00 = _mm256_add_epi16(p00, coeff4); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_mullo_epi16(p00, coeff5); p00 = _mm256_srli_epi16(p00, 5); L0 = _mm256_setr_epi16(src[-1], src[-1], src[-1], src[-1], src[-1], src[-1], src[-1], src[-1], src[1], src[1], src[1], src[1], src[1], src[1], src[1], src[1]);//-1 3 p01 = _mm256_mullo_epi16(L1, 
coeff0);//0 p11 = _mm256_mullo_epi16(L2, coeff1);//1 p21 = _mm256_mullo_epi16(L3, coeff2);//2 p31 = _mm256_mullo_epi16(L0, coeff3);//3 p01 = _mm256_add_epi16(p01, coeff4); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, p21); p01 = _mm256_add_epi16(p01, p31); p01 = _mm256_mullo_epi16(p01, coeff5); p01 = _mm256_srli_epi16(p01, 5); res1 = _mm256_packus_epi16(p00, p01); _mm256_storeu_si256((__m256i*)pfirst1, res1); } } else { L0 = _mm256_setr_epi16(src[-1], src[-1], src[-1], src[-1], src[-1], src[-1], src[-1], src[-1], src[3], src[3], src[3], src[3], src[3], src[3], src[3], src[3]);//-1 3 L1 = _mm256_setr_epi16(src[0], src[0], src[0], src[0], src[0], src[0], src[0], src[0], src[4], src[4], src[4], src[4], src[4], src[4], src[4], src[4]);//0 4 L2 = _mm256_setr_epi16(src[1], src[1], src[1], src[1], src[1], src[1], src[1], src[1], src[5], src[5], src[5], src[5], src[5], src[5], src[5], src[5]);//1 5 L3 = _mm256_setr_epi16(src[2], src[2], src[2], src[2], src[2], src[2], src[2], src[2], src[6], src[6], src[6], src[6], src[6], src[6], src[6], src[6]);//2 6 src += 4; for (i = 0; i < left_size + 1; i += 64, src += 4) { p00 = _mm256_mullo_epi16(L0, coeff0);//-1 3 p10 = _mm256_mullo_epi16(L1, coeff1);// 0 4 p20 = _mm256_mullo_epi16(L2, coeff2);// 1 5 p30 = _mm256_mullo_epi16(L3, coeff3);// 2 6 p00 = _mm256_add_epi16(p00, coeff4); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_mullo_epi16(p00, coeff5); p00 = _mm256_srli_epi16(p00, 5); L0 = _mm256_setr_epi16(src[-1], src[-1], src[-1], src[-1], src[-1], src[-1], src[-1], src[-1], src[3], src[3], src[3], src[3], src[3], src[3], src[3], src[3]);//3 7 p01 = _mm256_mullo_epi16(L1, coeff0);//0 4 p11 = _mm256_mullo_epi16(L2, coeff1);//1 5 p21 = _mm256_mullo_epi16(L3, coeff2);//2 6 p31 = _mm256_mullo_epi16(L0, coeff3);//3 7 p01 = _mm256_add_epi16(p01, coeff4); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, p21); p01 = _mm256_add_epi16(p01, p31); p01 = _mm256_mullo_epi16(p01, coeff5); p01 = _mm256_srli_epi16(p01, 5); res1 = _mm256_packus_epi16(p00, p01); L1 = _mm256_setr_epi16(src[0], src[0], src[0], src[0], src[0], src[0], src[0], src[0], src[4], src[4], src[4], src[4], src[4], src[4], src[4], src[4]);//4 8 p00 = _mm256_mullo_epi16(L2, coeff0);//1 5 p10 = _mm256_mullo_epi16(L3, coeff1);//2 6 p20 = _mm256_mullo_epi16(L0, coeff2);//3 7 p30 = _mm256_mullo_epi16(L1, coeff3);//4 8 p00 = _mm256_add_epi16(p00, coeff4); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_add_epi16(p00, p20); p00 = _mm256_add_epi16(p00, p30); p00 = _mm256_mullo_epi16(p00, coeff5); p00 = _mm256_srli_epi16(p00, 5); L2 = _mm256_setr_epi16(src[1], src[1], src[1], src[1], src[1], src[1], src[1], src[1], src[5], src[5], src[5], src[5], src[5], src[5], src[5], src[5]);//5 9 p01 = _mm256_mullo_epi16(L3, coeff0);//2 6 p11 = _mm256_mullo_epi16(L0, coeff1);//3 7 p21 = _mm256_mullo_epi16(L1, coeff2);//4 8 p31 = _mm256_mullo_epi16(L2, coeff3);//5 9 p01 = _mm256_add_epi16(p01, coeff4); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_add_epi16(p01, p21); p01 = _mm256_add_epi16(p01, p31); p01 = _mm256_mullo_epi16(p01, coeff5); p01 = _mm256_srli_epi16(p01, 5); res2 = _mm256_packus_epi16(p00, p01); p00 = _mm256_permute2x128_si256(res1, res2, 0x0020); _mm256_storeu_si256((__m256i*)pfirst1, p00); pfirst1 += 32; p00 = _mm256_permute2x128_si256(res1, res2, 0x0031); _mm256_storeu_si256((__m256i*)pfirst1, p00); pfirst1 += 32; src += 4; L0 = _mm256_setr_epi16(src[-1], src[-1], src[-1], src[-1], src[-1], src[-1], src[-1], 
src[-1], src[3], src[3], src[3], src[3], src[3], src[3], src[3], src[3]); L1 = _mm256_setr_epi16(src[0], src[0], src[0], src[0], src[0], src[0], src[0], src[0], src[4], src[4], src[4], src[4], src[4], src[4], src[4], src[4]); L2 = _mm256_setr_epi16(src[1], src[1], src[1], src[1], src[1], src[1], src[1], src[1], src[5], src[5], src[5], src[5], src[5], src[5], src[5], src[5]); L3 = _mm256_setr_epi16(src[2], src[2], src[2], src[2], src[2], src[2], src[2], src[2], src[6], src[6], src[6], src[6], src[6], src[6], src[6], src[6]); } } src = src_org + 1; __m256i S0, S1, S2; coeff2 = _mm256_set1_epi16(2); for (; i < line_size; i += 32, src += 32) { S0 = _mm256_loadu_si256((__m256i*)(src)); S1 = _mm256_loadu_si256((__m256i*)(src + 1)); S2 = _mm256_loadu_si256((__m256i*)(src - 1)); __m256i L0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 0)); __m256i L1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 0)); __m256i L2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 0)); H0 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S0, 1)); H1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S1, 1)); H2 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(S2, 1)); p00 = _mm256_mullo_epi16(L0, coeff2); p10 = _mm256_add_epi16(L1, L2); p00 = _mm256_add_epi16(p00, coeff2); p00 = _mm256_add_epi16(p00, p10); p00 = _mm256_srli_epi16(p00, 2); p01 = _mm256_mullo_epi16(H0, coeff2); p11 = _mm256_add_epi16(H1, H2); p01 = _mm256_add_epi16(p01, coeff2); p01 = _mm256_add_epi16(p01, p11); p01 = _mm256_srli_epi16(p01, 2); p00 = _mm256_packus_epi16(p00, p01); p00 = _mm256_permute4x64_epi64(p00, 0x00D8); _mm256_storeu_si256((__m256i*)&first_line[i], p00); } __m256i M; if (bsx == 64) { for (i = 0; i < bsy; i += 4){ M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; pfirst -= 8; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; pfirst -= 8; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; pfirst -= 8; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_storeu_si256((__m256i*)dst, M); M = _mm256_lddqu_si256((__m256i*)(pfirst + 32)); _mm256_storeu_si256((__m256i*)(dst + 32), M); dst += i_dst; pfirst -= 8; } } else if (bsx == 32){ for (i = 0; i < bsy; i += 4){ M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; pfirst -= 8; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; pfirst -= 8; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; pfirst -= 8; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_storeu_si256((__m256i*)dst, M); dst += i_dst; pfirst -= 8; } } else if (bsx == 16){ __m256i mask = _mm256_lddqu_si256((__m256i*)intrinsic_mask_256_8bit[15]); for (i = 0; i < bsy; i += 4){ M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; pfirst -= 8; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; pfirst -= 8; M = _mm256_lddqu_si256((__m256i*)pfirst); _mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; pfirst -= 8; M = _mm256_lddqu_si256((__m256i*)pfirst); 
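/* bsx == 16: the loads are full 32-byte reads, but the masked store below clips each write to the 16-pixel row; pfirst steps back 8 pixels per row in this mode */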
_mm256_maskstore_epi64((__int64 *)dst, mask, M); dst += i_dst; pfirst -= 8; } } /*for (i = 0; i < bsy; i++) { memcpy(dst, pfirst, bsx * sizeof(pel_t)); dst += i_dst; pfirst -= 8; }*/ } else {//8x8 8x32 4x4 4x16------128bit is enough
intra_pred_ang_xy_23_sse128(src, dst, i_dst, dir_mode, bsx, bsy); return; } }
#endif
davs2-1.6/source/common/vec/intrinsic_pixel.cc000066400000000000000000000137111337322544400214700ustar00rootroot00000000000000
/* * intrinsic_pixel.cc * * Description of this file: * SSE assembly functions of Pixel-Processing module of the davs2 library * * -------------------------------------------------------------------------- * * davs2 - video decoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */
#include "../common.h"
#include "intrinsic.h"
#include <mmintrin.h>
#include <emmintrin.h>
#include <tmmintrin.h>
#include <smmintrin.h>
void avs_pixel_average_sse128(pel_t *dst, int i_dst, const pel_t *src0, int i_src0, const pel_t *src1, int i_src1, int width, int height) {
#if HIGH_BIT_DEPTH
int j; __m128i D; if (width & 7) { __m128i mask = _mm_load_si128((const __m128i *)intrinsic_mask_10bit[(width & 7) - 1]); while (height--) { for (j = 0; j < width - 7; j += 8) { D = _mm_avg_epu16(_mm_loadu_si128((const __m128i *)(src0 + j)), _mm_loadu_si128((const __m128i *)(src1 + j))); _mm_storeu_si128((__m128i *)(dst + j), D); } D = _mm_avg_epu16(_mm_loadu_si128((const __m128i *)(src0 + j)), _mm_loadu_si128((const __m128i *)(src1 + j))); _mm_maskmoveu_si128(D, mask, (char *)&dst[j]); src0 += i_src0; src1 += i_src1; dst += i_dst; } } else { while (height--) { for (j = 0; j < width; j += 8) { D = _mm_avg_epu16(_mm_loadu_si128((const __m128i *)(src0 + j)), _mm_loadu_si128((const __m128i *)(src1 + j))); _mm_storeu_si128((__m128i *)(dst + j), D); } src0 += i_src0; src1 += i_src1; dst += i_dst; } }
#else
int i, j; __m128i S1, S2, D; if (width & 15) { __m128i mask = _mm_load_si128((const __m128i*)intrinsic_mask[(width & 15) - 1]); for (i = 0; i < height; i++) { for (j = 0; j < width - 15; j += 16) { S1 = _mm_loadu_si128((const __m128i*)(src0 + j)); S2 = _mm_loadu_si128((const __m128i*)(src1 + j)); D = _mm_avg_epu8(S1, S2); _mm_storeu_si128((__m128i*)(dst + j), D); } S1 = _mm_loadu_si128((const __m128i*)(src0 + j)); S2 = _mm_loadu_si128((const __m128i*)(src1 + j)); D = _mm_avg_epu8(S1, S2); _mm_maskmoveu_si128(D, mask, (char*)(dst + j)); src0 += i_src0; src1 += i_src1; dst += i_dst; } } else { for (i = 0; i < height; i++) { for (j = 0; j < width; j += 16) { S1 = _mm_loadu_si128((const __m128i*)(src0 + j)); S2 = _mm_loadu_si128((const __m128i*)(src1 + j)); D = _mm_avg_epu8(S1, S2); _mm_storeu_si128((__m128i*)(dst + j), D); } src0 += i_src0; src1 +=
i_src1; dst += i_dst; } } #endif } /* --------------------------------------------------------------------------- */ void *davs2_memzero_aligned_c_sse2(void *dst, size_t n) { __m128i *p_dst = (__m128i *)dst; __m128i m0 = _mm_setzero_si128(); int i = (int)(n >> 4); for (; i != 0; i--) { _mm_store_si128(p_dst, m0); p_dst++; } return dst; } /* --------------------------------------------------------------------------- */ void *davs2_memcpy_aligned_c_sse2(void *dst, const void *src, size_t n) { __m128i *p_dst = (__m128i *)dst; const __m128i *p_src = (const __m128i *)src; int i = (int)(n >> 4); for (; i != 0; i--) { _mm_store_si128(p_dst, _mm_load_si128(p_src)); p_src++; p_dst++; } return dst; } /* --------------------------------------------------------------------------- */ void plane_copy_c_sse2(pel_t *dst, intptr_t i_dst, pel_t *src, intptr_t i_src, int w, int h) { const int n128 = (w * sizeof(pel_t)) >> 4; int n_left = (w * sizeof(pel_t)) - (n128 << 4); if (n_left) { int n_offset = (n128 << 4); while (h--) { const __m128i *p_src = (const __m128i *)src; __m128i *p_dst = (__m128i *)dst; int n = n128; for (; n != 0; n--) { _mm_storeu_si128(p_dst, _mm_loadu_si128(p_src)); p_dst++; p_src++; } memcpy((uint8_t *)(dst) + n_offset, (uint8_t *)(src) + n_offset, n_left); dst += i_dst; src += i_src; } } else { while (h--) { const __m128i *p_src = (const __m128i *)src; __m128i *p_dst = (__m128i *)dst; int n = n128; for (; n != 0; n--) { _mm_storeu_si128(p_dst, _mm_loadu_si128(p_src)); p_dst++; p_src++; } dst += i_dst; src += i_src; } } } davs2-1.6/source/common/vec/intrinsic_pixel_avx.cc000066400000000000000000000401631337322544400223470ustar00rootroot00000000000000/* * intrinsic_pixel_avx.cc * * Description of this file: * AVX2 assembly functions of Pixel-Processing module of the davs2 library * * -------------------------------------------------------------------------- * * davs2 - video decoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. 
*/ #include "../common.h" #include "intrinsic.h" #include #include #include #include #include /* --------------------------------------------------------------------------- */ void *davs2_memzero_aligned_c_avx(void *dst, size_t n) { __m256i *p_dst = (__m256i *)dst; __m256i m0 = _mm256_setzero_si256(); int i = (int)(n >> 5); for (; i != 0; i--) { _mm256_store_si256(p_dst, m0); p_dst++; } return dst; } #if _MSC_VER #if !HIGH_BIT_DEPTH void padding_rows_sse256_10bit(pel_t *src, int i_src, int width, int height, int start, int rows, int pad) { int i, j; pel_t *p, *p1, *p2; int pad_lr = pad + 16 - (pad & 0xF); start = max(start, 0); if (start + rows > height) { rows = height - start; } p = src + start * i_src; // left & right for (i = 0; i < rows; i++) { __m256i Val1 = _mm256_set1_epi16((int16_t)p[0]); __m256i Val2 = _mm256_set1_epi16((int16_t)p[width - 1]); p1 = p - pad_lr; p2 = p + width; for (j = 0; j < pad_lr; j += 16) { _mm256_storeu_si256((__m256i *)(p1 + j), Val1); _mm256_storeu_si256((__m256i *)(p2 + j), Val2); } p += i_src; } if (start == 0) { p = src - pad; for (i = 1; i <= pad; i++) { memcpy(p - i_src * i, p, (width + 2 * pad) * sizeof(pel_t)); } } if (start + rows == height) { p = src + i_src * (height - 1) - pad; for (i = 1; i <= pad; i++) { memcpy(p + i_src * i, p, (width + 2 * pad) * sizeof(pel_t)); } } } void padding_rows_lr_sse256_10bit(pel_t *src, int i_src, int width, int height, int start, int rows, int pad) { int i, j; pel_t *p, *p1, *p2; int pad_lr = pad + 16 - (pad & 0xF); start = max(start, 0); if (start + rows > height) { rows = height - start; } p = src + start * i_src; // left & right for (i = 0; i < rows; i++) { __m256i Val1 = _mm256_set1_epi16((int16_t)p[0]); __m256i Val2 = _mm256_set1_epi16((int16_t)p[width - 1]); p1 = p - pad_lr; p2 = p + width; for (j = 0; j < pad_lr; j += 16) { _mm256_storeu_si256((__m256i *)(p1 + j), Val1); _mm256_storeu_si256((__m256i *)(p2 + j), Val2); } p += i_src; } } #endif void add_pel_clip_sse256(const pel_t *src1, int i_src1, const coeff_t *src2, int i_src2, pel_t *dst, int i_dst, int width, int height) { #if !HIGH_BIT_DEPTH int i, j; __m256i mask; __m128i mask1; if (width >= 32) { __m256i S, R1, R2, S1, S2, D; __m256i zero = _mm256_setzero_si256(); mask = _mm256_load_si256((const __m256i *)intrinsic_mask32[(width & 31)]); for (i = 0; i < height; i++) { S = _mm256_loadu_si256((const __m256i *)(src1)); R1 = _mm256_loadu_si256((const __m256i *)(src2)); R2 = _mm256_loadu_si256((const __m256i *)(src2 + 16)); S = _mm256_permute4x64_epi64(S, 0xd8); S1 = _mm256_unpacklo_epi8(S, zero); S2 = _mm256_unpackhi_epi8(S, zero); S1 = _mm256_add_epi16(R1, S1); S2 = _mm256_add_epi16(R2, S2); D = _mm256_packus_epi16(S1, S2); D = _mm256_permute4x64_epi64(D, 0xd8); _mm256_storeu_si256((__m256i *)(dst), D); if (width > 32) { S = _mm256_loadu_si256((const __m256i *)(src1 + 32)); R1 = _mm256_loadu_si256((const __m256i *)(src2 + 32)); R2 = _mm256_loadu_si256((const __m256i *)(src2 + 48)); S = _mm256_permute4x64_epi64(S, 0xd8); S1 = _mm256_unpacklo_epi8(S, zero); S2 = _mm256_unpackhi_epi8(S, zero); S1 = _mm256_add_epi16(R1, S1); S2 = _mm256_add_epi16(R2, S2); D = _mm256_packus_epi16(S1, S2); D = _mm256_permute4x64_epi64(D, 0xd8); _mm256_maskstore_epi32((int *)(dst + 32), mask, D); } src1 += i_src1; src2 += i_src2; dst += i_dst; } } else { __m128i zero = _mm_setzero_si128(); __m128i S, S1, S2, R1, R2, D; if (width & 15) { mask1 = _mm_load_si128((const __m128i *)intrinsic_mask[(width & 15) - 1]); for (i = 0; i < height; i++) { for (j = 0; j < width - 15; j += 16) 
{ S = _mm_load_si128((const __m128i *)(src1 + j)); R1 = _mm_load_si128((const __m128i *)(src2 + j)); R2 = _mm_load_si128((const __m128i *)(src2 + j + 8)); S1 = _mm_unpacklo_epi8(S, zero); S2 = _mm_unpackhi_epi8(S, zero); S1 = _mm_add_epi16(R1, S1); S2 = _mm_add_epi16(R2, S2); D = _mm_packus_epi16(S1, S2); _mm_store_si128((__m128i *)(dst + j), D); } S = _mm_loadu_si128((const __m128i *)(src1 + j)); R1 = _mm_loadu_si128((const __m128i *)(src2 + j)); R2 = _mm_loadu_si128((const __m128i *)(src2 + j + 8)); S1 = _mm_unpacklo_epi8(S, zero); S2 = _mm_unpackhi_epi8(S, zero); S1 = _mm_add_epi16(R1, S1); S2 = _mm_add_epi16(R2, S2); D = _mm_packus_epi16(S1, S2); _mm_maskmoveu_si128(D, mask1, (char *)&dst[j]); src1 += i_src1; src2 += i_src2; dst += i_dst; } } else { for (i = 0; i < height; i++) { for (j = 0; j < width; j += 16) { S = _mm_load_si128((const __m128i *)(src1 + j)); R1 = _mm_load_si128((const __m128i *)(src2 + j)); R2 = _mm_load_si128((const __m128i *)(src2 + j + 8)); S1 = _mm_unpacklo_epi8(S, zero); S2 = _mm_unpackhi_epi8(S, zero); S1 = _mm_add_epi16(R1, S1); S2 = _mm_add_epi16(R2, S2); D = _mm_packus_epi16(S1, S2); _mm_store_si128((__m128i *)(dst + j), D); } src1 += i_src1; src2 += i_src2; dst += i_dst; } } } #else int j; __m256i zero = _mm256_setzero_si256(); __m256i D; __m256i max_val = _mm256_set1_epi16((short)(max_pel_value)); if (width & 15) { __m256i mask = _mm256_loadu_si256((const __m256i *)intrinsic_mask_10bit[(width & 15) - 1]); while (height--) { for (j = 0; j < width - 15; j += 16) { D = _mm256_add_epi16(_mm256_loadu_si256((const __m256i *)(src1 + j)), _mm256_loadu_si256((const __m256i *)(src2 + j))); D = _mm256_min_epi16(D, max_val); D = _mm256_max_epi16(D, zero); _mm256_storeu_si256((__m256i *)(dst + j), D); } D = _mm256_add_epi16(_mm256_loadu_si256((const __m256i *)(src1 + j)), _mm256_loadu_si256((const __m256i *)(src2 + j))); D = _mm256_min_epi16(D, max_val); D = _mm256_max_epi16(D, zero); _mm256_maskstore_epi32((int *)&dst[j], mask, D); src1 += i_src1; src2 += i_src2; dst += i_dst; } } else { while (height--) { for (j = 0; j < width - 15; j += 16) { D = _mm256_add_epi16(_mm256_loadu_si256((const __m256i *)(src1 + j)), _mm256_loadu_si256((const __m256i *)(src2 + j))); D = _mm256_min_epi16(D, max_val); D = _mm256_max_epi16(D, zero); _mm256_storeu_si256((__m256i *)(dst + j), D); } src1 += i_src1; src2 += i_src2; dst += i_dst; } } #endif } void davs2_pixel_average_avx(pel_t *dst, int i_dst, const pel_t *src1, int i_src1, const pel_t *src2, int i_src2, int width, int height) { #if HIGH_BIT_DEPTH int j; if (width & 15) { __m256i mask = _mm256_loadu_si256((const __m256i *)intrinsic_mask_10bit[(width & 15) - 1]); while (height--) { __m256i D; for (j = 0; j < width - 15; j += 16) { D = _mm256_avg_epu16(_mm256_loadu_si256((const __m256i *)(src1 + j)), _mm256_loadu_si256((const __m256i *)(src2 + j))); _mm256_storeu_si256((__m256i *)(dst + j), D); } D = _mm256_avg_epu16(_mm256_loadu_si256((const __m256i *)(src1 + j)), _mm256_loadu_si256((const __m256i *)(src2 + j))); _mm256_maskstore_epi32((int *)&dst[j], mask, D); src1 += i_src1; src2 += i_src2; dst += i_dst; } } else { while (height--) { for (j = 0; j < width - 15; j += 16) { __m256i D = _mm256_avg_epu16(_mm256_loadu_si256((const __m256i *)(src1 + j)), _mm256_loadu_si256((const __m256i *)(src2 + j))); _mm256_storeu_si256((__m256i *)(dst + j), D); } src1 += i_src1; src2 += i_src2; dst += i_dst; } } #else int i; if (width >= 32) { __m256i mask = _mm256_load_si256((const __m256i *)intrinsic_mask32[(width & 31)]); for (i = 0; i < 
height; i++) { __m256i S1 = _mm256_loadu_si256((const __m256i *)(src1)); __m256i S2 = _mm256_load_si256((const __m256i *)(src2)); __m256i D = _mm256_avg_epu8(S1, S2); _mm256_storeu_si256((__m256i *)(dst), D); if (32 < width) { S1 = _mm256_loadu_si256((const __m256i *)(src1 + 32)); S2 = _mm256_load_si256((const __m256i *)(src2 + 32)); D = _mm256_avg_epu8(S1, S2); _mm256_maskstore_epi32((int *)(dst + 32), mask, D); } src1 += i_src1; src2 += i_src2; dst += i_dst; } } else { int i, j; if (width & 15) { __m128i mask = _mm_load_si128((const __m128i *)intrinsic_mask[(width & 15) - 1]); for (i = 0; i < height; i++) { __m128i S1, S2, D; for (j = 0; j < width - 15; j += 16) { S1 = _mm_loadu_si128((const __m128i *)(src1 + j)); S2 = _mm_load_si128((const __m128i *)(src2 + j)); D = _mm_avg_epu8(S1, S2); _mm_storeu_si128((__m128i *)(dst + j), D); } S1 = _mm_loadu_si128((const __m128i *)(src1 + j)); S2 = _mm_load_si128((const __m128i *)(src2 + j)); D = _mm_avg_epu8(S1, S2); _mm_maskmoveu_si128(D, mask, (char *)&dst[j]); src1 += i_src1; src2 += i_src2; dst += i_dst; } } else { for (i = 0; i < height; i++) { for (j = 0; j < width; j += 16) { __m128i S1 = _mm_loadu_si128((const __m128i *)(src1 + j)); __m128i S2 = _mm_load_si128((const __m128i *)(src2 + j)); __m128i D = _mm_avg_epu8(S1, S2); _mm_storeu_si128((__m128i *)(dst + j), D); } src1 += i_src1; src2 += i_src2; dst += i_dst; } } } #endif } #if !HIGH_BIT_DEPTH void padding_rows_lr_sse256(pel_t *src, int i_src, int width, int height, int start, int rows, int pad) { int i, j; pel_t *p, *p1, *p2; start = max(start, 0); if (start + rows > height) { rows = height - start; } p = src + start * i_src; pad = pad + 16 - (pad & 0xF); if (pad & 0x1f) { __m256i mask = _mm256_setr_epi16(-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0); for (i = 0; i < rows; i++) { __m256i Val1 = _mm256_set1_epi8((char)p[0]); __m256i Val2 = _mm256_set1_epi8((char)p[width - 1]); p1 = p - pad; p2 = p + width; for (j = 0; j < pad - 31; j += 32) { _mm256_storeu_si256((__m256i *)(p1 + j), Val1); _mm256_storeu_si256((__m256i *)(p2 + j), Val2); } _mm256_maskstore_epi32((int *)(p1 + j), mask, Val1); _mm256_maskstore_epi32((int *)(p2 + j), mask, Val2); p += i_src; } } else { __m256i Val1 = _mm256_set1_epi8((char)p[0]); __m256i Val2 = _mm256_set1_epi8((char)p[width - 1]); p1 = p - pad; p2 = p + width; for (j = 0; j < pad; j += 32) { _mm256_storeu_si256((__m256i *)(p1 + j), Val1); _mm256_storeu_si256((__m256i *)(p2 + j), Val2); } p += i_src; } } void padding_rows_sse256(pel_t *src, int i_src, int width, int height, int start, int rows, int pad) { int i, j; pel_t *p, *p1, *p2; start = max(start, 0); if (start + rows > height) { rows = height - start; } p = src + start * i_src; pad = pad + 16 - (pad & 0xF); if (pad & 0x1f) { __m256i mask = _mm256_setr_epi16(-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0); for (i = 0; i < rows; i++) { __m256i Val1 = _mm256_set1_epi8((char)p[0]); __m256i Val2 = _mm256_set1_epi8((char)p[width - 1]); p1 = p - pad; p2 = p + width; for (j = 0; j < pad - 31; j += 32) { _mm256_storeu_si256((__m256i *)(p1 + j), Val1); _mm256_storeu_si256((__m256i *)(p2 + j), Val2); } _mm256_maskstore_epi32((int *)(p1 + j), mask, Val1); _mm256_maskstore_epi32((int *)(p2 + j), mask, Val2); p += i_src; } } else { __m256i Val1 = _mm256_set1_epi8((char)p[0]); __m256i Val2 = _mm256_set1_epi8((char)p[width - 1]); p1 = p - pad; p2 = p + width; for (j = 0; j < pad; j += 32) { _mm256_storeu_si256((__m256i *)(p1 + j), Val1); _mm256_storeu_si256((__m256i *)(p2 + j), Val2); } p += i_src; 
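/* what follows replicates the first and last rows of the padded stripe into the top and bottom margins (pad rows each) */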
} if (start == 0) { p = src - pad; for (i = 1; i <= pad; i++) { memcpy(p - i_src * i, p, (width + 2 * pad) * sizeof(pel_t)); } } if (start + rows == height) { p = src + i_src * (height - 1) - pad; for (i = 1; i <= pad; i++) { memcpy(p + i_src * i, p, (width + 2 * pad) * sizeof(pel_t)); } } }
#endif
#endif // #if _MSC_VER
davs2-1.6/source/common/vec/intrinsic_sao.cc000066400000000000000000000636711337322544400211300ustar00rootroot00000000000000
/* * intrinsic_sao.cc * * Description of this file: * SSE assembly functions of SAO module of the davs2 library * * -------------------------------------------------------------------------- * * davs2 - video decoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */
#include "../common.h"
#include "intrinsic.h"
#include <mmintrin.h>
#include <emmintrin.h>
#include <tmmintrin.h>
#include <smmintrin.h>
#ifdef _MSC_VER
#pragma warning(disable:4244) // TODO: warning
#endif
#if !HIGH_BIT_DEPTH
/* --------------------------------------------------------------------------- * lcu neighbor */ enum lcu_neighbor_e { SAO_T = 0, /* top */ SAO_D = 1, /* down */ SAO_L = 2, /* left */ SAO_R = 3, /* right */ SAO_TL = 4, /* top-left */ SAO_TR = 5, /* top-right */ SAO_DL = 6, /* down-left */ SAO_DR = 7 /* down-right */ }; /* --------------------------------------------------------------------------- */ void SAO_on_block_eo_0_sse128(pel_t *p_dst, int i_dst, const pel_t *p_src, int i_src, int i_block_w, int i_block_h, int bit_depth, const int *lcu_avail, const int *sao_offset) { __m128i off0, off1, off2, off3, off4; __m128i s0, s1, s2; __m128i t0, t1, t2, t3, t4, etype; __m128i c0, c1, c2, c3, c4; __m128i mask; int x, y; UNUSED_PARAMETER(bit_depth); __m128i clipMin = _mm_setzero_si128(); int end_x_16; c0 = _mm_set1_epi8(-2); c1 = _mm_set1_epi8(-1); c2 = _mm_set1_epi8(0); c3 = _mm_set1_epi8(1); c4 = _mm_set1_epi8(2); off0 = _mm_set1_epi8((int8_t)sao_offset[0]); off1 = _mm_set1_epi8((int8_t)sao_offset[1]); off2 = _mm_set1_epi8((int8_t)sao_offset[2]); off3 = _mm_set1_epi8((int8_t)sao_offset[3]); off4 = _mm_set1_epi8((int8_t)sao_offset[4]); int start_x = lcu_avail[SAO_L] ? 0 : 1; int end_x = lcu_avail[SAO_R] ?
i_block_w : (i_block_w - 1); end_x_16 = end_x - ((end_x - start_x) & 0x0f); for (y = 0; y < i_block_h; y++) { for (x = start_x; x < end_x; x += 16) { s0 = _mm_loadu_si128((__m128i*)&p_src[x - 1]); s1 = _mm_loadu_si128((__m128i*)&p_src[x]); s2 = _mm_loadu_si128((__m128i*)&p_src[x + 1]); t3 = _mm_min_epu8(s0, s1); t1 = _mm_cmpeq_epi8(t3, s0); t2 = _mm_cmpeq_epi8(t3, s1); t0 = _mm_subs_epi8(t2, t1); t3 = _mm_min_epu8(s1, s2); t1 = _mm_cmpeq_epi8(t3, s1); t2 = _mm_cmpeq_epi8(t3, s2); t3 = _mm_subs_epi8(t1, t2); //rightsign etype = _mm_adds_epi8(t0, t3); //edgetype=leftsign+rightsign t0 = _mm_cmpeq_epi8(etype, c0); t1 = _mm_cmpeq_epi8(etype, c1); t2 = _mm_cmpeq_epi8(etype, c2); t3 = _mm_cmpeq_epi8(etype, c3); t4 = _mm_cmpeq_epi8(etype, c4); t0 = _mm_and_si128(t0, off0); t1 = _mm_and_si128(t1, off1); t2 = _mm_and_si128(t2, off2); t3 = _mm_and_si128(t3, off3); t4 = _mm_and_si128(t4, off4); t0 = _mm_adds_epi8(t0, t1); t2 = _mm_adds_epi8(t2, t3); t0 = _mm_adds_epi8(t0, t4); t0 = _mm_adds_epi8(t0, t2);//get offset //add 8 nums once for possible overflow t1 = _mm_cvtepi8_epi16(t0); t0 = _mm_srli_si128(t0, 8); t2 = _mm_cvtepi8_epi16(t0); t3 = _mm_unpacklo_epi8(s1, clipMin); t4 = _mm_unpackhi_epi8(s1, clipMin); t1 = _mm_adds_epi16(t1, t3); t2 = _mm_adds_epi16(t2, t4); t0 = _mm_packus_epi16(t1, t2); if (x != end_x_16) { _mm_storeu_si128((__m128i*)(p_dst + x), t0); } else { mask = _mm_load_si128((__m128i*)(intrinsic_mask[end_x - end_x_16 - 1])); _mm_maskmoveu_si128(t0, mask, (char*)(p_dst + x)); break; } } p_dst += i_dst; p_src += i_src; } } /* --------------------------------------------------------------------------- */ void SAO_on_block_eo_90_sse128(pel_t *p_dst, int i_dst, const pel_t *p_src, int i_src, int i_block_w, int i_block_h, int bit_depth, const int *lcu_avail, const int *sao_offset) { __m128i off0, off1, off2, off3, off4; __m128i s0, s1, s2; __m128i t0, t1, t2, t3, t4, etype; __m128i c0, c1, c2, c3, c4; __m128i mask; int x, y; UNUSED_PARAMETER(bit_depth); __m128i clipMin = _mm_setzero_si128(); int end_x_16 = i_block_w - 15; c0 = _mm_set1_epi8(-2); c1 = _mm_set1_epi8(-1); c2 = _mm_set1_epi8(0); c3 = _mm_set1_epi8(1); c4 = _mm_set1_epi8(2); off0 = _mm_set1_epi8((int8_t)sao_offset[0]); off1 = _mm_set1_epi8((int8_t)sao_offset[1]); off2 = _mm_set1_epi8((int8_t)sao_offset[2]); off3 = _mm_set1_epi8((int8_t)sao_offset[3]); off4 = _mm_set1_epi8((int8_t)sao_offset[4]); int start_y = lcu_avail[SAO_T] ? 0 : 1; int end_y = lcu_avail[SAO_D] ? 
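/* EO_90 is the vertical variant of the same classification: the two
 * neighbours are p_src[x - i_src] and p_src[x + i_src] (the pixels above and
 * below the current one), so start_y/end_y shrink the filtered range by one
 * row whenever the top or bottom neighbouring LCU is unavailable. */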
i_block_h : (i_block_h - 1); p_dst += start_y * i_dst; p_src += start_y * i_src; for (y = start_y; y < end_y; y++) { for (x = 0; x < i_block_w; x += 16) { s0 = _mm_loadu_si128((__m128i*)&p_src[x - i_src]); s1 = _mm_loadu_si128((__m128i*)&p_src[x]); s2 = _mm_loadu_si128((__m128i*)&p_src[x + i_src]); t3 = _mm_min_epu8(s0, s1); t1 = _mm_cmpeq_epi8(t3, s0); t2 = _mm_cmpeq_epi8(t3, s1); t0 = _mm_subs_epi8(t2, t1); //upsign t3 = _mm_min_epu8(s1, s2); t1 = _mm_cmpeq_epi8(t3, s1); t2 = _mm_cmpeq_epi8(t3, s2); t3 = _mm_subs_epi8(t1, t2); //downsign etype = _mm_adds_epi8(t0, t3); //edgetype t0 = _mm_cmpeq_epi8(etype, c0); t1 = _mm_cmpeq_epi8(etype, c1); t2 = _mm_cmpeq_epi8(etype, c2); t3 = _mm_cmpeq_epi8(etype, c3); t4 = _mm_cmpeq_epi8(etype, c4); t0 = _mm_and_si128(t0, off0); t1 = _mm_and_si128(t1, off1); t2 = _mm_and_si128(t2, off2); t3 = _mm_and_si128(t3, off3); t4 = _mm_and_si128(t4, off4); t0 = _mm_adds_epi8(t0, t1); t2 = _mm_adds_epi8(t2, t3); t0 = _mm_adds_epi8(t0, t4); t0 = _mm_adds_epi8(t0, t2);//get offset //add 8 nums once for possible overflow t1 = _mm_cvtepi8_epi16(t0); t0 = _mm_srli_si128(t0, 8); t2 = _mm_cvtepi8_epi16(t0); t3 = _mm_unpacklo_epi8(s1, clipMin); t4 = _mm_unpackhi_epi8(s1, clipMin); t1 = _mm_adds_epi16(t1, t3); t2 = _mm_adds_epi16(t2, t4); t0 = _mm_packus_epi16(t1, t2); //saturated if (x < end_x_16) { _mm_storeu_si128((__m128i*)(p_dst + x), t0); } else { mask = _mm_loadu_si128((__m128i*)(intrinsic_mask[(i_block_w & 15) - 1])); _mm_maskmoveu_si128(t0, mask, (char*)(p_dst + x)); break; } } p_dst += i_dst; p_src += i_src; } } /* --------------------------------------------------------------------------- */ void SAO_on_block_eo_135_sse128(pel_t *p_dst, int i_dst, const pel_t *p_src, int i_src, int i_block_w, int i_block_h, int bit_depth, const int *lcu_avail, const int *sao_offset) { __m128i off0, off1, off2, off3, off4; __m128i s0, s1, s2; __m128i t0, t1, t2, t3, t4, etype; __m128i c0, c1, c2, c3, c4; __m128i mask_r0, mask_r, mask_rn; int x, y; __m128i clipMin = _mm_setzero_si128(); int end_x_r0_16, end_x_r_16, end_x_rn_16; UNUSED_PARAMETER(bit_depth); c0 = _mm_set1_epi8(-2); c1 = _mm_set1_epi8(-1); c2 = _mm_set1_epi8(0); c3 = _mm_set1_epi8(1); c4 = _mm_set1_epi8(2); off0 = _mm_set1_epi8((int8_t)sao_offset[0]); off1 = _mm_set1_epi8((int8_t)sao_offset[1]); off2 = _mm_set1_epi8((int8_t)sao_offset[2]); off3 = _mm_set1_epi8((int8_t)sao_offset[3]); off4 = _mm_set1_epi8((int8_t)sao_offset[4]); //first row int start_x_r0 = lcu_avail[SAO_TL] ? 0 : 1; int end_x_r0 = lcu_avail[SAO_T] ? (lcu_avail[SAO_R] ? 
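/* EO_135 compares against the top-left and bottom-right neighbours, so the
 * first and last rows of the block get their own x ranges: the first row
 * depends on the top/top-left/top-right availability, the middle rows only
 * on left/right, and the last row on down/down-left/down-right. The nested
 * ternaries below encode exactly these range rules. */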
i_block_w : (i_block_w - 1)) : 1; end_x_r0_16 = end_x_r0 - ((end_x_r0 - start_x_r0) & 0x0f); for (x = start_x_r0; x < end_x_r0; x += 16) { s0 = _mm_loadu_si128((__m128i*)&p_src[x - i_src - 1]); s1 = _mm_loadu_si128((__m128i*)&p_src[x]); s2 = _mm_loadu_si128((__m128i*)&p_src[x + i_src + 1]); t3 = _mm_min_epu8(s0, s1); t1 = _mm_cmpeq_epi8(t3, s0); t2 = _mm_cmpeq_epi8(t3, s1); t0 = _mm_subs_epi8(t2, t1); //upsign t3 = _mm_min_epu8(s1, s2); t1 = _mm_cmpeq_epi8(t3, s1); t2 = _mm_cmpeq_epi8(t3, s2); t3 = _mm_subs_epi8(t1, t2); //downsign etype = _mm_adds_epi8(t0, t3); //edgetype t0 = _mm_cmpeq_epi8(etype, c0); t1 = _mm_cmpeq_epi8(etype, c1); t2 = _mm_cmpeq_epi8(etype, c2); t3 = _mm_cmpeq_epi8(etype, c3); t4 = _mm_cmpeq_epi8(etype, c4); t0 = _mm_and_si128(t0, off0); t1 = _mm_and_si128(t1, off1); t2 = _mm_and_si128(t2, off2); t3 = _mm_and_si128(t3, off3); t4 = _mm_and_si128(t4, off4); t0 = _mm_adds_epi8(t0, t1); t2 = _mm_adds_epi8(t2, t3); t0 = _mm_adds_epi8(t0, t4); t0 = _mm_adds_epi8(t0, t2);//get offset //add 8 nums once for possible overflow t1 = _mm_cvtepi8_epi16(t0); t0 = _mm_srli_si128(t0, 8); t2 = _mm_cvtepi8_epi16(t0); t3 = _mm_unpacklo_epi8(s1, clipMin); t4 = _mm_unpackhi_epi8(s1, clipMin); t1 = _mm_adds_epi16(t1, t3); t2 = _mm_adds_epi16(t2, t4); t0 = _mm_packus_epi16(t1, t2); //saturated if (x != end_x_r0_16) { _mm_storeu_si128((__m128i*)(p_dst + x), t0); } else { mask_r0 = _mm_load_si128((__m128i*)(intrinsic_mask[end_x_r0 - end_x_r0_16 - 1])); _mm_maskmoveu_si128(t0, mask_r0, (char*)(p_dst + x)); break; } } p_dst += i_dst; p_src += i_src; //middle rows int start_x_r = lcu_avail[SAO_L] ? 0 : 1; int end_x_r = lcu_avail[SAO_R] ? i_block_w : (i_block_w - 1); end_x_r_16 = end_x_r - ((end_x_r - start_x_r) & 0x0f); for (y = 1; y < i_block_h - 1; y++) { for (x = start_x_r; x < end_x_r; x += 16) { s0 = _mm_loadu_si128((__m128i*)&p_src[x - i_src - 1]); s1 = _mm_loadu_si128((__m128i*)&p_src[x]); s2 = _mm_loadu_si128((__m128i*)&p_src[x + i_src + 1]); t3 = _mm_min_epu8(s0, s1); t1 = _mm_cmpeq_epi8(t3, s0); t2 = _mm_cmpeq_epi8(t3, s1); t0 = _mm_subs_epi8(t2, t1); //upsign t3 = _mm_min_epu8(s1, s2); t1 = _mm_cmpeq_epi8(t3, s1); t2 = _mm_cmpeq_epi8(t3, s2); t3 = _mm_subs_epi8(t1, t2); //downsign etype = _mm_adds_epi8(t0, t3); //edgetype t0 = _mm_cmpeq_epi8(etype, c0); t1 = _mm_cmpeq_epi8(etype, c1); t2 = _mm_cmpeq_epi8(etype, c2); t3 = _mm_cmpeq_epi8(etype, c3); t4 = _mm_cmpeq_epi8(etype, c4); t0 = _mm_and_si128(t0, off0); t1 = _mm_and_si128(t1, off1); t2 = _mm_and_si128(t2, off2); t3 = _mm_and_si128(t3, off3); t4 = _mm_and_si128(t4, off4); t0 = _mm_adds_epi8(t0, t1); t2 = _mm_adds_epi8(t2, t3); t0 = _mm_adds_epi8(t0, t4); t0 = _mm_adds_epi8(t0, t2);//get offset //add 8 nums once for possible overflow t1 = _mm_cvtepi8_epi16(t0); t0 = _mm_srli_si128(t0, 8); t2 = _mm_cvtepi8_epi16(t0); t3 = _mm_unpacklo_epi8(s1, clipMin); t4 = _mm_unpackhi_epi8(s1, clipMin); t1 = _mm_adds_epi16(t1, t3); t2 = _mm_adds_epi16(t2, t4); t0 = _mm_packus_epi16(t1, t2); //saturated if (x != end_x_r_16) { _mm_storeu_si128((__m128i*)(p_dst + x), t0); } else { mask_r = _mm_load_si128((__m128i*)(intrinsic_mask[end_x_r - end_x_r_16 - 1])); _mm_maskmoveu_si128(t0, mask_r, (char*)(p_dst + x)); break; } } p_dst += i_dst; p_src += i_src; } //last row int start_x_rn = lcu_avail[SAO_D] ? (lcu_avail[SAO_L] ? 0 : 1) : (i_block_w - 1); int end_x_rn = lcu_avail[SAO_DR] ? 
i_block_w : (i_block_w - 1); end_x_rn_16 = end_x_rn - ((end_x_rn - start_x_rn) & 0x0f); for (x = start_x_rn; x < end_x_rn; x += 16) { s0 = _mm_loadu_si128((__m128i*)&p_src[x - i_src - 1]); s1 = _mm_loadu_si128((__m128i*)&p_src[x]); s2 = _mm_loadu_si128((__m128i*)&p_src[x + i_src + 1]); t3 = _mm_min_epu8(s0, s1); t1 = _mm_cmpeq_epi8(t3, s0); t2 = _mm_cmpeq_epi8(t3, s1); t0 = _mm_subs_epi8(t2, t1); //upsign t3 = _mm_min_epu8(s1, s2); t1 = _mm_cmpeq_epi8(t3, s1); t2 = _mm_cmpeq_epi8(t3, s2); t3 = _mm_subs_epi8(t1, t2); //downsign etype = _mm_adds_epi8(t0, t3); //edgetype t0 = _mm_cmpeq_epi8(etype, c0); t1 = _mm_cmpeq_epi8(etype, c1); t2 = _mm_cmpeq_epi8(etype, c2); t3 = _mm_cmpeq_epi8(etype, c3); t4 = _mm_cmpeq_epi8(etype, c4); t0 = _mm_and_si128(t0, off0); t1 = _mm_and_si128(t1, off1); t2 = _mm_and_si128(t2, off2); t3 = _mm_and_si128(t3, off3); t4 = _mm_and_si128(t4, off4); t0 = _mm_adds_epi8(t0, t1); t2 = _mm_adds_epi8(t2, t3); t0 = _mm_adds_epi8(t0, t4); t0 = _mm_adds_epi8(t0, t2);//get offset //add 8 nums once for possible overflow t1 = _mm_cvtepi8_epi16(t0); t0 = _mm_srli_si128(t0, 8); t2 = _mm_cvtepi8_epi16(t0); t3 = _mm_unpacklo_epi8(s1, clipMin); t4 = _mm_unpackhi_epi8(s1, clipMin); t1 = _mm_adds_epi16(t1, t3); t2 = _mm_adds_epi16(t2, t4); t0 = _mm_packus_epi16(t1, t2); //saturated if (x != end_x_rn_16) { _mm_storeu_si128((__m128i*)(p_dst + x), t0); } else { mask_rn = _mm_load_si128((__m128i*)(intrinsic_mask[end_x_rn - end_x_rn_16 - 1])); _mm_maskmoveu_si128(t0, mask_rn, (char*)(p_dst + x)); break; } } } /* --------------------------------------------------------------------------- */ void SAO_on_block_eo_45_sse128(pel_t *p_dst, int i_dst, const pel_t *p_src, int i_src, int i_block_w, int i_block_h, int bit_depth, const int *lcu_avail, const int *sao_offset) { __m128i off0, off1, off2, off3, off4; __m128i s0, s1, s2; __m128i t0, t1, t2, t3, t4, etype; __m128i c0, c1, c2, c3, c4; __m128i mask_r0, mask_r, mask_rn; int x, y; __m128i clipMin = _mm_setzero_si128(); int end_x_r0_16, end_x_r_16, end_x_rn_16; UNUSED_PARAMETER(bit_depth); c0 = _mm_set1_epi8(-2); c1 = _mm_set1_epi8(-1); c2 = _mm_set1_epi8(0); c3 = _mm_set1_epi8(1); c4 = _mm_set1_epi8(2); off0 = _mm_set1_epi8((int8_t)sao_offset[0]); off1 = _mm_set1_epi8((int8_t)sao_offset[1]); off2 = _mm_set1_epi8((int8_t)sao_offset[2]); off3 = _mm_set1_epi8((int8_t)sao_offset[3]); off4 = _mm_set1_epi8((int8_t)sao_offset[4]); //first row int start_x_r0 = lcu_avail[SAO_T] ? (lcu_avail[SAO_L] ? 0 : 1) : (i_block_w - 1); int end_x_r0 = lcu_avail[SAO_TR] ? 
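/* EO_45 mirrors EO_135 along the other diagonal: the neighbours are the
 * top-right pixel (x - i_src + 1) and the bottom-left pixel (x + i_src - 1),
 * so the first-row range is bounded by the top-right availability and the
 * last-row range by the down-left one. */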
i_block_w : (i_block_w - 1); end_x_r0_16 = end_x_r0 - ((end_x_r0 - start_x_r0) & 0x0f); for (x = start_x_r0; x < end_x_r0; x += 16) { s0 = _mm_loadu_si128((__m128i*)&p_src[x - i_src + 1]); s1 = _mm_loadu_si128((__m128i*)&p_src[x]); s2 = _mm_loadu_si128((__m128i*)&p_src[x + i_src - 1]); t3 = _mm_min_epu8(s0, s1); t1 = _mm_cmpeq_epi8(t3, s0); t2 = _mm_cmpeq_epi8(t3, s1); t0 = _mm_subs_epi8(t2, t1); //upsign t3 = _mm_min_epu8(s1, s2); t1 = _mm_cmpeq_epi8(t3, s1); t2 = _mm_cmpeq_epi8(t3, s2); t3 = _mm_subs_epi8(t1, t2); //downsign etype = _mm_adds_epi8(t0, t3); //edgetype t0 = _mm_cmpeq_epi8(etype, c0); t1 = _mm_cmpeq_epi8(etype, c1); t2 = _mm_cmpeq_epi8(etype, c2); t3 = _mm_cmpeq_epi8(etype, c3); t4 = _mm_cmpeq_epi8(etype, c4); t0 = _mm_and_si128(t0, off0); t1 = _mm_and_si128(t1, off1); t2 = _mm_and_si128(t2, off2); t3 = _mm_and_si128(t3, off3); t4 = _mm_and_si128(t4, off4); t0 = _mm_adds_epi8(t0, t1); t2 = _mm_adds_epi8(t2, t3); t0 = _mm_adds_epi8(t0, t4); t0 = _mm_adds_epi8(t0, t2);//get offset //add 8 nums once for possible overflow t1 = _mm_cvtepi8_epi16(t0); t0 = _mm_srli_si128(t0, 8); t2 = _mm_cvtepi8_epi16(t0); t3 = _mm_unpacklo_epi8(s1, clipMin); t4 = _mm_unpackhi_epi8(s1, clipMin); t1 = _mm_adds_epi16(t1, t3); t2 = _mm_adds_epi16(t2, t4); t0 = _mm_packus_epi16(t1, t2); //saturated if (x != end_x_r0_16) { _mm_storeu_si128((__m128i*)(p_dst + x), t0); } else { mask_r0 = _mm_load_si128((__m128i*)(intrinsic_mask[end_x_r0 - end_x_r0_16 - 1])); _mm_maskmoveu_si128(t0, mask_r0, (char*)(p_dst + x)); break; } } p_dst += i_dst; p_src += i_src; //middle rows int start_x_r = lcu_avail[SAO_L] ? 0 : 1; int end_x_r = lcu_avail[SAO_R] ? i_block_w : (i_block_w - 1); end_x_r_16 = end_x_r - ((end_x_r - start_x_r) & 0x0f); for (y = 1; y < i_block_h - 1; y++) { for (x = start_x_r; x < end_x_r; x += 16) { s0 = _mm_loadu_si128((__m128i*)&p_src[x - i_src + 1]); s1 = _mm_loadu_si128((__m128i*)&p_src[x]); s2 = _mm_loadu_si128((__m128i*)&p_src[x + i_src - 1]); t3 = _mm_min_epu8(s0, s1); t1 = _mm_cmpeq_epi8(t3, s0); t2 = _mm_cmpeq_epi8(t3, s1); t0 = _mm_subs_epi8(t2, t1); //upsign t3 = _mm_min_epu8(s1, s2); t1 = _mm_cmpeq_epi8(t3, s1); t2 = _mm_cmpeq_epi8(t3, s2); t3 = _mm_subs_epi8(t1, t2); //downsign etype = _mm_adds_epi8(t0, t3); //edgetype t0 = _mm_cmpeq_epi8(etype, c0); t1 = _mm_cmpeq_epi8(etype, c1); t2 = _mm_cmpeq_epi8(etype, c2); t3 = _mm_cmpeq_epi8(etype, c3); t4 = _mm_cmpeq_epi8(etype, c4); t0 = _mm_and_si128(t0, off0); t1 = _mm_and_si128(t1, off1); t2 = _mm_and_si128(t2, off2); t3 = _mm_and_si128(t3, off3); t4 = _mm_and_si128(t4, off4); t0 = _mm_adds_epi8(t0, t1); t2 = _mm_adds_epi8(t2, t3); t0 = _mm_adds_epi8(t0, t4); t0 = _mm_adds_epi8(t0, t2);//get offset //add 8 nums once for possible overflow t1 = _mm_cvtepi8_epi16(t0); t0 = _mm_srli_si128(t0, 8); t2 = _mm_cvtepi8_epi16(t0); t3 = _mm_unpacklo_epi8(s1, clipMin); t4 = _mm_unpackhi_epi8(s1, clipMin); t1 = _mm_adds_epi16(t1, t3); t2 = _mm_adds_epi16(t2, t4); t0 = _mm_packus_epi16(t1, t2); //saturated if (x != end_x_r_16) { _mm_storeu_si128((__m128i*)(p_dst + x), t0); } else { mask_r = _mm_load_si128((__m128i*)(intrinsic_mask[end_x_r - end_x_r_16 - 1])); _mm_maskmoveu_si128(t0, mask_r, (char*)(p_dst + x)); break; } } p_dst += i_dst; p_src += i_src; } //last row int start_x_rn = lcu_avail[SAO_DL] ? 0 : 1; int end_x_rn = lcu_avail[SAO_D] ? (lcu_avail[SAO_R] ? 
i_block_w : (i_block_w - 1)) : 1; end_x_rn_16 = end_x_rn - ((end_x_rn - start_x_rn) & 0x0f); for (x = start_x_rn; x < end_x_rn; x += 16) { s0 = _mm_loadu_si128((__m128i*)&p_src[x - i_src + 1]); s1 = _mm_loadu_si128((__m128i*)&p_src[x]); s2 = _mm_loadu_si128((__m128i*)&p_src[x + i_src - 1]); t3 = _mm_min_epu8(s0, s1); t1 = _mm_cmpeq_epi8(t3, s0); t2 = _mm_cmpeq_epi8(t3, s1); t0 = _mm_subs_epi8(t2, t1); //upsign t3 = _mm_min_epu8(s1, s2); t1 = _mm_cmpeq_epi8(t3, s1); t2 = _mm_cmpeq_epi8(t3, s2); t3 = _mm_subs_epi8(t1, t2); //downsign etype = _mm_adds_epi8(t0, t3); //edgetype t0 = _mm_cmpeq_epi8(etype, c0); t1 = _mm_cmpeq_epi8(etype, c1); t2 = _mm_cmpeq_epi8(etype, c2); t3 = _mm_cmpeq_epi8(etype, c3); t4 = _mm_cmpeq_epi8(etype, c4); t0 = _mm_and_si128(t0, off0); t1 = _mm_and_si128(t1, off1); t2 = _mm_and_si128(t2, off2); t3 = _mm_and_si128(t3, off3); t4 = _mm_and_si128(t4, off4); t0 = _mm_adds_epi8(t0, t1); t2 = _mm_adds_epi8(t2, t3); t0 = _mm_adds_epi8(t0, t4); t0 = _mm_adds_epi8(t0, t2);//get offset //add 8 nums once for possible overflow t1 = _mm_cvtepi8_epi16(t0); t0 = _mm_srli_si128(t0, 8); t2 = _mm_cvtepi8_epi16(t0); t3 = _mm_unpacklo_epi8(s1, clipMin); t4 = _mm_unpackhi_epi8(s1, clipMin); t1 = _mm_adds_epi16(t1, t3); t2 = _mm_adds_epi16(t2, t4); t0 = _mm_packus_epi16(t1, t2); //saturated if (x != end_x_rn_16) { _mm_storeu_si128((__m128i*)(p_dst + x), t0); } else { mask_rn = _mm_load_si128((__m128i*)(intrinsic_mask[end_x_rn - end_x_rn_16 - 1])); _mm_maskmoveu_si128(t0, mask_rn, (char*)(p_dst + x)); break; } } } /* --------------------------------------------------------------------------- */ void SAO_on_block_bo_sse128(pel_t *p_dst, int i_dst, const pel_t *p_src, int i_src, int i_block_w, int i_block_h, int bit_depth, const sao_param_t *sao_param) { __m128i r0, r1, r2, r3, off0, off1, off2, off3; __m128i t0, t1, t2, t3; __m128i mask; int shift_bo = g_bit_depth - NUM_SAO_BO_CLASSES_IN_BIT; int x, y; __m128i src0, src1; __m128i shift_mask = _mm_set1_epi8(31); __m128i clipMin = _mm_setzero_si128(); int end_x_16 = i_block_w - 15; UNUSED_PARAMETER(bit_depth); r0 = _mm_set1_epi8((int8_t)(sao_param->startBand)); r1 = _mm_set1_epi8((int8_t)((sao_param->startBand + 1) & 31)); r2 = _mm_set1_epi8((int8_t)(sao_param->startBand2)); r3 = _mm_set1_epi8((int8_t)((sao_param->startBand2 + 1) & 31)); off0 = _mm_set1_epi8((int8_t)sao_param->offset[sao_param->startBand]); off1 = _mm_set1_epi8((int8_t)sao_param->offset[(sao_param->startBand + 1) & 31]); off2 = _mm_set1_epi8((int8_t)sao_param->offset[sao_param->startBand2]); off3 = _mm_set1_epi8((int8_t)sao_param->offset[(sao_param->startBand2 + 1) & 31]); for (y = 0; y < i_block_h; y++) { for (x = 0; x < i_block_w; x += 16) { __m128i t4; src0 = _mm_loadu_si128((__m128i*)&p_src[x]); src1 = _mm_and_si128(_mm_srai_epi16(src0, shift_bo), shift_mask); t0 = _mm_cmpeq_epi8(src1, r0); t1 = _mm_cmpeq_epi8(src1, r1); t2 = _mm_cmpeq_epi8(src1, r2); t3 = _mm_cmpeq_epi8(src1, r3); t0 = _mm_and_si128(t0, off0); t1 = _mm_and_si128(t1, off1); t2 = _mm_and_si128(t2, off2); t3 = _mm_and_si128(t3, off3); t0 = _mm_or_si128(t0, t1); t2 = _mm_or_si128(t2, t3); t0 = _mm_or_si128(t0, t2);//get offset //add 8 nums once for possible overflow t1 = _mm_cvtepi8_epi16(t0); t0 = _mm_srli_si128(t0, 8); t2 = _mm_cvtepi8_epi16(t0); t3 = _mm_unpacklo_epi8(src0, clipMin); t4 = _mm_unpackhi_epi8(src0, clipMin); t1 = _mm_adds_epi16(t1, t3); t2 = _mm_adds_epi16(t2, t4); src0 = _mm_packus_epi16(t1, t2); //saturated if (x < end_x_16) { _mm_storeu_si128((__m128i*)&p_dst[x], src0); } else { mask = 
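/* SAO band offset (BO): the sample range is split into 32 equal bands,
 * band = pixel >> (bit_depth - 5), and only the two pairs of consecutive
 * bands anchored at startBand and startBand2 carry non-zero offsets. A
 * scalar sketch of one pixel (clip() stands for the usual clamp to
 * [0, 255]; helper name hypothetical):
 *
 *     int band = p_src[x] >> shift_bo;   // 0..31 for 8-bit input
 *     p_dst[x] = clip(p_src[x] + sao_param->offset[band]);
 *
 * where offset[] is non-zero only for the four selected bands. The SSE
 * kernel reaches the same result with four byte compares OR-ed together;
 * the 16-bit shift followed by a per-byte AND with 31 is what makes the
 * band computation come out right in every byte lane. */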
_mm_load_si128((const __m128i*)intrinsic_mask[(i_block_w & 15) - 1]); _mm_maskmoveu_si128(src0, mask, (char*)(p_dst + x)); } } p_dst += i_dst; p_src += i_src; } } #endif // !HIGH_BIT_DEPTH davs2-1.6/source/common/vec/intrinsic_sao_avx2.cc000066400000000000000000000671011337322544400220730ustar00rootroot00000000000000/* * intrinsic_sao_avx2.cc * * Description of this file: * AVX2 assembly functions of SAO module of the davs2 library * * -------------------------------------------------------------------------- * * davs2 - video decoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ #include "../common.h" #include "intrinsic.h" #include #include #include #include #include #if !HIGH_BIT_DEPTH #ifdef _MSC_VER #pragma warning(disable:4244) // TODO: warning #endif /* --------------------------------------------------------------------------- * lcu neighbor */ enum lcu_neighbor_e { SAO_T = 0, /* top */ SAO_D = 1, /* down */ SAO_L = 2, /* left */ SAO_R = 3, /* right */ SAO_TL = 4, /* top-left */ SAO_TR = 5, /* top-right */ SAO_DL = 6, /* down-left */ SAO_DR = 7 /* down-right */ }; /* --------------------------------------------------------------------------- */ void SAO_on_block_eo_0_avx2(pel_t *p_dst, int i_dst, const pel_t *p_src, int i_src, int i_block_w, int i_block_h, int bit_depth, const int *lcu_avail, const int *sao_offset) { int x, y; __m256i off; __m256i s0, s1, s2; __m256i t0, t1, t2, t3, t4, etype; __m128i mask, offtmp; int start_x = lcu_avail[SAO_L] ? 0 : 1; int end_x = lcu_avail[SAO_R] ? 
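/* The AVX2 SAO kernels that follow replace the five compare-and-select
 * steps of the SSE versions with one table lookup: sao_offset[0..4] is
 * packed into the low five bytes of each 128-bit lane of `off`, and
 * _mm256_shuffle_epi8(off, etype + 2) then picks the right offset per byte,
 * since etype + 2 is always a valid in-lane shuffle index in [0, 4]. */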
i_block_w : (i_block_w - 1); int end_x_32; __m256i c2 = _mm256_set1_epi8(2); UNUSED_PARAMETER(bit_depth); offtmp = _mm_loadu_si128((__m128i*)sao_offset); offtmp = _mm_packs_epi32(offtmp, _mm_set_epi32(0, 0, 0, sao_offset[4])); offtmp = _mm_packs_epi16(offtmp, _mm_setzero_si128()); off = _mm256_castsi128_si256(offtmp); off = _mm256_inserti128_si256(off, offtmp, 1); end_x_32 = end_x - ((end_x - start_x) & 0x1f); for (y = 0; y < i_block_h; y++) { for (x = start_x; x < end_x; x += 32) { s0 = _mm256_lddqu_si256((__m256i*)&p_src[x - 1]); s1 = _mm256_loadu_si256((__m256i*)&p_src[x]); s2 = _mm256_loadu_si256((__m256i*)&p_src[x + 1]); t3 = _mm256_min_epu8(s0, s1); t1 = _mm256_cmpeq_epi8(t3, s0); t2 = _mm256_cmpeq_epi8(t3, s1); t0 = _mm256_subs_epi8(t2, t1); //leftsign t3 = _mm256_min_epu8(s1, s2); t1 = _mm256_cmpeq_epi8(t3, s1); t2 = _mm256_cmpeq_epi8(t3, s2); t3 = _mm256_subs_epi8(t1, t2); //rightsign etype = _mm256_adds_epi8(t0, t3); etype = _mm256_adds_epi8(etype, c2);//edgetype=left + right +2 t0 = _mm256_shuffle_epi8(off, etype);//get offset //convert byte to short for possible overflow t1 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(t0)); t2 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(t0, 1)); t3 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s1)); t4 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s1, 1)); t1 = _mm256_adds_epi16(t1, t3); t2 = _mm256_adds_epi16(t2, t4); t0 = _mm256_packus_epi16(t1, t2); //saturated t0 = _mm256_permute4x64_epi64(t0, 0xd8); if (x != end_x_32) { _mm256_storeu_si256((__m256i*)(p_dst + x), t0); } else { if (end_x - x >= 16) { _mm_storeu_si128((__m128i*)(p_dst + x), _mm256_castsi256_si128(t0)); if (end_x - x > 16) { mask = _mm_loadu_si128((__m128i*)(intrinsic_mask[end_x - end_x_32 - 17])); _mm_maskmoveu_si128(_mm256_extracti128_si256(t0, 1), mask, (char*)(p_dst + x + 16)); } } else { mask = _mm_load_si128((__m128i*)(intrinsic_mask[end_x - end_x_32 - 1])); _mm_maskmoveu_si128(_mm256_castsi256_si128(t0), mask, (char*)(p_dst + x)); } break; } } p_dst += i_dst; p_src += i_src; } } /* --------------------------------------------------------------------------- */ void SAO_on_block_eo_90_avx2(pel_t *p_dst, int i_dst, const pel_t *p_src, int i_src, int i_block_w, int i_block_h, int bit_depth, const int *lcu_avail, const int *sao_offset) { int start_y, end_y; int x, y; __m256i off; __m256i s0, s1, s2; __m256i t0, t1, t2, t3, t4, etype; __m128i mask, offtmp; __m256i c2 = _mm256_set1_epi8(2); int end_x_32 = i_block_w - (i_block_w & 0x1f); UNUSED_PARAMETER(bit_depth); offtmp = _mm_loadu_si128((__m128i*)sao_offset); offtmp = _mm_packs_epi32(offtmp, _mm_set_epi32(0, 0, 0, sao_offset[4])); offtmp = _mm_packs_epi16(offtmp, _mm_setzero_si128()); off = _mm256_castsi128_si256(offtmp); off = _mm256_inserti128_si256(off, offtmp, 1); start_y = lcu_avail[SAO_T] ? 0 : 1; end_y = lcu_avail[SAO_D] ? 
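/* Tail handling for the 32-byte AVX2 iterations: when fewer than 32 pixels
 * remain, the low 16 bytes go out with a plain 128-bit store when possible,
 * and the final 1..15 bytes are written through _mm_maskmoveu_si128 with a
 * mask from the intrinsic_mask table, so the kernel never stores past the
 * filtered region. */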
i_block_h : (i_block_h - 1); p_dst += start_y * i_dst; p_src += start_y * i_src; for (y = start_y; y < end_y; y++) { for (x = 0; x < i_block_w; x += 32) { s0 = _mm256_lddqu_si256((__m256i*)&p_src[x - i_src]); s1 = _mm256_lddqu_si256((__m256i*)&p_src[x]); s2 = _mm256_lddqu_si256((__m256i*)&p_src[x + i_src]); t3 = _mm256_min_epu8(s0, s1); t1 = _mm256_cmpeq_epi8(t3, s0); t2 = _mm256_cmpeq_epi8(t3, s1); t0 = _mm256_subs_epi8(t2, t1); //leftsign t3 = _mm256_min_epu8(s1, s2); t1 = _mm256_cmpeq_epi8(t3, s1); t2 = _mm256_cmpeq_epi8(t3, s2); t3 = _mm256_subs_epi8(t1, t2); //rightsign etype = _mm256_adds_epi8(t0, t3); //edgetype etype = _mm256_adds_epi8(etype, c2); t0 = _mm256_shuffle_epi8(off, etype);//get offset //convert byte to short for possible overflow t1 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(t0)); t2 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(t0, 1)); t3 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s1)); t4 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s1, 1)); t1 = _mm256_adds_epi16(t1, t3); t2 = _mm256_adds_epi16(t2, t4); t0 = _mm256_packus_epi16(t1, t2); //saturated t0 = _mm256_permute4x64_epi64(t0, 0xd8); if (x != end_x_32) { _mm256_storeu_si256((__m256i*)(p_dst + x), t0); } else { if (i_block_w - x >= 16) { _mm_storeu_si128((__m128i*)(p_dst + x), _mm256_castsi256_si128(t0)); if (i_block_w - x > 16) { mask = _mm_loadu_si128((__m128i*)(intrinsic_mask[i_block_w - end_x_32 - 17])); _mm_maskmoveu_si128(_mm256_extracti128_si256(t0, 1), mask, (char*)(p_dst + x + 16)); } } else { mask = _mm_load_si128((__m128i*)(intrinsic_mask[i_block_w - end_x_32 - 1])); _mm_maskmoveu_si128(_mm256_castsi256_si128(t0), mask, (char*)(p_dst + x)); } break; } } p_dst += i_dst; p_src += i_src; } } /* --------------------------------------------------------------------------- */ void SAO_on_block_eo_135_avx2(pel_t *p_dst, int i_dst, const pel_t *p_src, int i_src, int i_block_w, int i_block_h, int bit_depth, const int *lcu_avail, const int *sao_offset) { int start_x_r0, end_x_r0, start_x_r, end_x_r, start_x_rn, end_x_rn; int x, y; __m256i off; __m256i s0, s1, s2; __m256i t0, t1, t2, t3, t4, etype; __m128i mask, offtmp; __m256i c2 = _mm256_set1_epi8(2); int end_x_r0_32, end_x_r_32, end_x_rn_32; UNUSED_PARAMETER(bit_depth); offtmp = _mm_loadu_si128((__m128i*)sao_offset); offtmp = _mm_packs_epi32(offtmp, _mm_set_epi32(0, 0, 0, sao_offset[4])); offtmp = _mm_packs_epi16(offtmp, _mm_setzero_si128()); off = _mm256_castsi128_si256(offtmp); off = _mm256_inserti128_si256(off, offtmp, 1); //first row start_x_r0 = lcu_avail[SAO_TL] ? 0 : 1; end_x_r0 = lcu_avail[SAO_T] ? (lcu_avail[SAO_R] ? 
i_block_w : (i_block_w - 1)) : 1; end_x_r0_32 = end_x_r0 - ((end_x_r0 - start_x_r0) & 0x1f); for (x = start_x_r0; x < end_x_r0; x += 32) { s0 = _mm256_loadu_si256((__m256i*)&p_src[x - i_src - 1]); s1 = _mm256_loadu_si256((__m256i*)&p_src[x]); s2 = _mm256_loadu_si256((__m256i*)&p_src[x + i_src + 1]); t3 = _mm256_min_epu8(s0, s1); t1 = _mm256_cmpeq_epi8(t3, s0); t2 = _mm256_cmpeq_epi8(t3, s1); t0 = _mm256_subs_epi8(t2, t1); //upsign t3 = _mm256_min_epu8(s1, s2); t1 = _mm256_cmpeq_epi8(t3, s1); t2 = _mm256_cmpeq_epi8(t3, s2); t3 = _mm256_subs_epi8(t1, t2); //downsign etype = _mm256_adds_epi8(t0, t3); //edgetype etype = _mm256_adds_epi8(etype, c2); t0 = _mm256_shuffle_epi8(off, etype);//get offset //convert byte to short for possible overflow t1 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(t0)); t2 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(t0, 1)); t3 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s1)); t4 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s1, 1)); t1 = _mm256_adds_epi16(t1, t3); t2 = _mm256_adds_epi16(t2, t4); t0 = _mm256_packus_epi16(t1, t2); //saturated t0 = _mm256_permute4x64_epi64(t0, 0xd8); if (x != end_x_r0_32) { _mm256_storeu_si256((__m256i*)(p_dst + x), t0); } else { if (end_x_r0 - x >= 16) { _mm_storeu_si128((__m128i*)(p_dst + x), _mm256_castsi256_si128(t0)); if (end_x_r0 - x > 16) { mask = _mm_loadu_si128((__m128i*)intrinsic_mask[end_x_r0 - end_x_r0_32 - 17]); _mm_maskmoveu_si128(_mm256_extracti128_si256(t0, 1), mask, (char*)(p_dst + x + 16)); } } else { mask = _mm_load_si128((__m128i*)(intrinsic_mask[end_x_r0 - end_x_r0_32 - 1])); _mm_maskmoveu_si128(_mm256_castsi256_si128(t0), mask, (char*)(p_dst + x)); } break; } } p_dst += i_dst; p_src += i_src; //middle rows start_x_r = lcu_avail[SAO_L] ? 0 : 1; end_x_r = lcu_avail[SAO_R] ? 
i_block_w : (i_block_w - 1); end_x_r_32 = end_x_r - ((end_x_r - start_x_r) & 0x1f); for (y = 1; y < i_block_h - 1; y++) { for (x = start_x_r; x < end_x_r; x += 32) { s0 = _mm256_loadu_si256((__m256i*)&p_src[x - i_src - 1]); s1 = _mm256_loadu_si256((__m256i*)&p_src[x]); s2 = _mm256_loadu_si256((__m256i*)&p_src[x + i_src + 1]); t3 = _mm256_min_epu8(s0, s1); t1 = _mm256_cmpeq_epi8(t3, s0); t2 = _mm256_cmpeq_epi8(t3, s1); t0 = _mm256_subs_epi8(t2, t1); //upsign t3 = _mm256_min_epu8(s1, s2); t1 = _mm256_cmpeq_epi8(t3, s1); t2 = _mm256_cmpeq_epi8(t3, s2); t3 = _mm256_subs_epi8(t1, t2); //downsign etype = _mm256_adds_epi8(t0, t3); //edgetype etype = _mm256_adds_epi8(etype, c2); t0 = _mm256_shuffle_epi8(off, etype);//get offset //convert byte to short for possible overflow t1 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(t0)); t2 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(t0, 1)); t3 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s1)); t4 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s1, 1)); t1 = _mm256_adds_epi16(t1, t3); t2 = _mm256_adds_epi16(t2, t4); t0 = _mm256_packus_epi16(t1, t2); //saturated t0 = _mm256_permute4x64_epi64(t0, 0xd8); if (x != end_x_r_32) { _mm256_storeu_si256((__m256i*)(p_dst + x), t0); } else { if (end_x_r - x >= 16) { _mm_storeu_si128((__m128i*)(p_dst + x), _mm256_castsi256_si128(t0)); if (end_x_r - x > 16) { mask = _mm_load_si128((__m128i*)(intrinsic_mask[end_x_r - end_x_r_32 - 17])); _mm_maskmoveu_si128(_mm256_extracti128_si256(t0, 1), mask, (char*)(p_dst + x + 16)); } } else { mask = _mm_load_si128((__m128i*)(intrinsic_mask[end_x_r - end_x_r_32 - 1])); _mm_maskmoveu_si128(_mm256_castsi256_si128(t0), mask, (char*)(p_dst + x)); } break; } } p_dst += i_dst; p_src += i_src; } //last row start_x_rn = lcu_avail[SAO_D] ? (lcu_avail[SAO_L] ? 0 : 1) : (i_block_w - 1); end_x_rn = lcu_avail[SAO_DR] ? 
i_block_w : (i_block_w - 1); end_x_rn_32 = end_x_rn - ((end_x_rn - start_x_rn) & 0x1f); for (x = start_x_rn; x < end_x_rn; x += 32) { s0 = _mm256_loadu_si256((__m256i*)&p_src[x - i_src - 1]); s1 = _mm256_loadu_si256((__m256i*)&p_src[x]); s2 = _mm256_loadu_si256((__m256i*)&p_src[x + i_src + 1]); t3 = _mm256_min_epu8(s0, s1); t1 = _mm256_cmpeq_epi8(t3, s0); t2 = _mm256_cmpeq_epi8(t3, s1); t0 = _mm256_subs_epi8(t2, t1); //upsign t3 = _mm256_min_epu8(s1, s2); t1 = _mm256_cmpeq_epi8(t3, s1); t2 = _mm256_cmpeq_epi8(t3, s2); t3 = _mm256_subs_epi8(t1, t2); //downsign etype = _mm256_adds_epi8(t0, t3); //edgetype etype = _mm256_adds_epi8(etype, c2); t0 = _mm256_shuffle_epi8(off, etype);//get offset //convert byte to short for possible overflow t1 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(t0)); t2 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(t0, 1)); t3 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s1)); t4 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s1, 1)); t1 = _mm256_adds_epi16(t1, t3); t2 = _mm256_adds_epi16(t2, t4); t0 = _mm256_packus_epi16(t1, t2); //saturated t0 = _mm256_permute4x64_epi64(t0, 0xd8); if (x != end_x_rn_32) { _mm256_storeu_si256((__m256i*)(p_dst + x), t0); } else { if (end_x_rn - x >= 16) { _mm_storeu_si128((__m128i*)(p_dst + x), _mm256_castsi256_si128(t0)); if (end_x_rn - x > 16) { mask = _mm_load_si128((__m128i*)(intrinsic_mask[end_x_rn - end_x_rn_32 - 17])); _mm_maskmoveu_si128(_mm256_extracti128_si256(t0, 1), mask, (char*)(p_dst + x + 16)); } } else { mask = _mm_load_si128((__m128i*)(intrinsic_mask[end_x_rn - end_x_rn_32 - 1])); _mm_maskmoveu_si128(_mm256_castsi256_si128(t0), mask, (char*)(p_dst + x)); } break; } } } /* --------------------------------------------------------------------------- */ void SAO_on_block_eo_45_avx2(pel_t *p_dst, int i_dst, const pel_t *p_src, int i_src, int i_block_w, int i_block_h, int bit_depth, const int *lcu_avail, const int *sao_offset) { int start_x_r0, end_x_r0, start_x_r, end_x_r, start_x_rn, end_x_rn; int x, y; __m256i off; __m256i s0, s1, s2; __m256i t0, t1, t2, t3, t4, etype; __m128i mask, offtmp; __m256i c2 = _mm256_set1_epi8(2); int end_x_r0_32, end_x_r_32, end_x_rn_32; UNUSED_PARAMETER(bit_depth); offtmp = _mm_loadu_si128((__m128i*)sao_offset); offtmp = _mm_packs_epi32(offtmp, _mm_set_epi32(0, 0, 0, sao_offset[4])); offtmp = _mm_packs_epi16(offtmp, _mm_setzero_si128()); off = _mm256_castsi128_si256(offtmp); off = _mm256_inserti128_si256(off, offtmp, 1); start_x_r0 = lcu_avail[SAO_T] ? (lcu_avail[SAO_L] ? 0 : 1) : (i_block_w - 1); end_x_r0 = lcu_avail[SAO_TR] ? 
i_block_w : (i_block_w - 1); end_x_r0_32 = end_x_r0 - ((end_x_r0 - start_x_r0) & 0x1f); //first row for (x = start_x_r0; x < end_x_r0; x += 32) { s0 = _mm256_loadu_si256((__m256i*)&p_src[x - i_src + 1]); s1 = _mm256_loadu_si256((__m256i*)&p_src[x]); s2 = _mm256_loadu_si256((__m256i*)&p_src[x + i_src - 1]); t3 = _mm256_min_epu8(s0, s1); t1 = _mm256_cmpeq_epi8(t3, s0); t2 = _mm256_cmpeq_epi8(t3, s1); t0 = _mm256_subs_epi8(t2, t1); //upsign t3 = _mm256_min_epu8(s1, s2); t1 = _mm256_cmpeq_epi8(t3, s1); t2 = _mm256_cmpeq_epi8(t3, s2); t3 = _mm256_subs_epi8(t1, t2); //downsign etype = _mm256_adds_epi8(t0, t3); //edgetype etype = _mm256_adds_epi8(etype, c2); t0 = _mm256_shuffle_epi8(off, etype);//get offset //convert byte to short for possible overflow t1 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(t0)); t2 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(t0, 1)); t3 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s1)); t4 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s1, 1)); t1 = _mm256_adds_epi16(t1, t3); t2 = _mm256_adds_epi16(t2, t4); t0 = _mm256_packus_epi16(t1, t2); //saturated t0 = _mm256_permute4x64_epi64(t0, 0xd8); if (x != end_x_r0_32) { _mm256_storeu_si256((__m256i*)(p_dst + x), t0); } else { if (end_x_r0 - x >= 16) { _mm_storeu_si128((__m128i*)(p_dst + x), _mm256_castsi256_si128(t0)); if (end_x_r0 - x > 16) { mask = _mm_load_si128((__m128i*)(intrinsic_mask[end_x_r0 - end_x_r0_32 - 17])); _mm_maskmoveu_si128(_mm256_extracti128_si256(t0, 1), mask, (char*)(p_dst + x + 16)); } } else { mask = _mm_load_si128((__m128i*)(intrinsic_mask[end_x_r0 - end_x_r0_32 - 1])); _mm_maskmoveu_si128(_mm256_castsi256_si128(t0), mask, (char*)(p_dst + x)); } break; } } p_dst += i_dst; p_src += i_src; //middle rows start_x_r = lcu_avail[SAO_L] ? 0 : 1; end_x_r = lcu_avail[SAO_R] ? 
i_block_w : (i_block_w - 1); end_x_r_32 = end_x_r - ((end_x_r - start_x_r) & 0x1f); for (y = 1; y < i_block_h - 1; y++) { for (x = start_x_r; x < end_x_r; x += 32) { s0 = _mm256_loadu_si256((__m256i*)&p_src[x - i_src + 1]); s1 = _mm256_loadu_si256((__m256i*)&p_src[x]); s2 = _mm256_loadu_si256((__m256i*)&p_src[x + i_src - 1]); t3 = _mm256_min_epu8(s0, s1); t1 = _mm256_cmpeq_epi8(t3, s0); t2 = _mm256_cmpeq_epi8(t3, s1); t0 = _mm256_subs_epi8(t2, t1); //upsign t3 = _mm256_min_epu8(s1, s2); t1 = _mm256_cmpeq_epi8(t3, s1); t2 = _mm256_cmpeq_epi8(t3, s2); t3 = _mm256_subs_epi8(t1, t2); //downsign etype = _mm256_adds_epi8(t0, t3); //edgetype etype = _mm256_adds_epi8(etype, c2); t0 = _mm256_shuffle_epi8(off, etype);//get offset //convert byte to short for possible overflow t1 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(t0)); t2 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(t0, 1)); t3 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s1)); t4 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s1, 1)); t1 = _mm256_adds_epi16(t1, t3); t2 = _mm256_adds_epi16(t2, t4); t0 = _mm256_packus_epi16(t1, t2); //saturated t0 = _mm256_permute4x64_epi64(t0, 0xd8); if (x != end_x_r_32) { _mm256_storeu_si256((__m256i*)(p_dst + x), t0); } else { if (end_x_r - x >= 16) { _mm_storeu_si128((__m128i*)(p_dst + x), _mm256_castsi256_si128(t0)); if (end_x_r - x > 16) { mask = _mm_load_si128((__m128i*)(intrinsic_mask[end_x_r - end_x_r_32 - 17])); _mm_maskmoveu_si128(_mm256_extracti128_si256(t0, 1), mask, (char*)(p_dst + x + 16)); } } else { mask = _mm_load_si128((__m128i*)(intrinsic_mask[end_x_r - end_x_r_32 - 1])); _mm_maskmoveu_si128(_mm256_castsi256_si128(t0), mask, (char*)(p_dst + x)); } break; } } p_dst += i_dst; p_src += i_src; } //last row start_x_rn = lcu_avail[SAO_DL] ? 0 : 1; end_x_rn = lcu_avail[SAO_D] ? (lcu_avail[SAO_R] ? 
i_block_w : (i_block_w - 1)) : 1; end_x_rn_32 = end_x_rn - ((end_x_rn - start_x_rn) & 0x1f); for (x = start_x_rn; x < end_x_rn; x += 32) { s0 = _mm256_loadu_si256((__m256i*)&p_src[x - i_src + 1]); s1 = _mm256_loadu_si256((__m256i*)&p_src[x]); s2 = _mm256_loadu_si256((__m256i*)&p_src[x + i_src - 1]); t3 = _mm256_min_epu8(s0, s1); t1 = _mm256_cmpeq_epi8(t3, s0); t2 = _mm256_cmpeq_epi8(t3, s1); t0 = _mm256_subs_epi8(t2, t1); //upsign t3 = _mm256_min_epu8(s1, s2); t1 = _mm256_cmpeq_epi8(t3, s1); t2 = _mm256_cmpeq_epi8(t3, s2); t3 = _mm256_subs_epi8(t1, t2); //downsign etype = _mm256_adds_epi8(t0, t3); //edgetype etype = _mm256_adds_epi8(etype, c2); t0 = _mm256_shuffle_epi8(off, etype);//get offset //convert byte to short for possible overflow t1 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(t0)); t2 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(t0, 1)); t3 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s1)); t4 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s1, 1)); t1 = _mm256_adds_epi16(t1, t3); t2 = _mm256_adds_epi16(t2, t4); t0 = _mm256_packus_epi16(t1, t2); //saturated t0 = _mm256_permute4x64_epi64(t0, 0xd8); if (x != end_x_rn_32) { _mm256_storeu_si256((__m256i*)(p_dst + x), t0); } else { if (end_x_rn - x >= 16) { _mm_storeu_si128((__m128i*)(p_dst + x), _mm256_castsi256_si128(t0)); if (end_x_rn - x > 16) { mask = _mm_load_si128((__m128i*)(intrinsic_mask[end_x_rn - end_x_rn_32 - 17])); _mm_maskmoveu_si128(_mm256_extracti128_si256(t0, 1), mask, (char*)(p_dst + x + 16)); } } else { mask = _mm_load_si128((__m128i*)(intrinsic_mask[end_x_rn - end_x_rn_32 - 1])); _mm_maskmoveu_si128(_mm256_castsi256_si128(t0), mask, (char*)(p_dst + x)); } break; } } } /* --------------------------------------------------------------------------- */ void SAO_on_block_bo_avx2(pel_t *p_dst, int i_dst, const pel_t *p_src, int i_src, int i_block_w, int i_block_h, int bit_depth, const sao_param_t *sao_param) { __m256i r0, r1, r2, r3, off0, off1, off2, off3; __m256i t0, t1, t2, t3, t4, src0, src1; __m128i mask = _mm_setzero_si128(); int x, y; int shift_bo = bit_depth - NUM_SAO_BO_CLASSES_IN_BIT; __m256i shift_mask = _mm256_set1_epi8(31); int end_x = i_block_w; int end_x_32 = end_x - ((end_x - 0) & 0x1f); UNUSED_PARAMETER(bit_depth); r0 = _mm256_set1_epi8((int8_t)(sao_param->startBand)); r1 = _mm256_set1_epi8((int8_t)((sao_param->startBand + 1) & 31)); r2 = _mm256_set1_epi8((int8_t)(sao_param->startBand2)); r3 = _mm256_set1_epi8((int8_t)((sao_param->startBand2 + 1) & 31)); off0 = _mm256_set1_epi8((int8_t)sao_param->offset[sao_param->startBand]); off1 = _mm256_set1_epi8((int8_t)sao_param->offset[(sao_param->startBand + 1) & 31]); off2 = _mm256_set1_epi8((int8_t)sao_param->offset[sao_param->startBand2]); off3 = _mm256_set1_epi8((int8_t)sao_param->offset[(sao_param->startBand2 + 1) & 31]); for (y = 0; y < i_block_h; y++) { for (x = 0; x < i_block_w; x += 32){ src0 = _mm256_loadu_si256((__m256i*)&p_src[x]); src1 = _mm256_srli_epi16(src0, shift_bo); src1 = _mm256_and_si256(src1, shift_mask); t0 = _mm256_cmpeq_epi8(src1, r0); t1 = _mm256_cmpeq_epi8(src1, r1); t2 = _mm256_cmpeq_epi8(src1, r2); t3 = _mm256_cmpeq_epi8(src1, r3); t0 = _mm256_and_si256(t0, off0); t1 = _mm256_and_si256(t1, off1); t2 = _mm256_and_si256(t2, off2); t3 = _mm256_and_si256(t3, off3); t0 = _mm256_or_si256(t0, t1); t2 = _mm256_or_si256(t2, t3); t0 = _mm256_or_si256(t0, t2); //convert byte to short for possible overflow t1 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(t0)); t2 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(t0, 1)); t3 = 
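/* Band offset, AVX2 flavour: the classification is the same as in the SSE
 * version (a 16-bit shift plus a per-byte AND with 31 yields
 * band = pixel >> (bit_depth - 5) in every byte lane); four compares select
 * among the two active band pairs, and the widened add is made safe by the
 * unsigned saturation of the final pack. */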
_mm256_cvtepu8_epi16(_mm256_castsi256_si128(src0)); t4 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(src0, 1)); t1 = _mm256_add_epi16(t1, t3); t2 = _mm256_add_epi16(t2, t4); t0 = _mm256_packus_epi16(t1, t2); //saturated t0 = _mm256_permute4x64_epi64(t0, 0xd8); if (x < end_x_32) { _mm256_storeu_si256((__m256i*)(p_dst + x), t0); } else { if (end_x - x >= 16) { _mm_storeu_si128((__m128i*)(p_dst + x), _mm256_castsi256_si128(t0)); if (end_x - x > 16) { mask = _mm_loadu_si128((__m128i*)(intrinsic_mask[end_x - end_x_32 - 17])); _mm_maskmoveu_si128(_mm256_extracti128_si256(t0, 1), mask, (char*)(p_dst + x + 16)); } } else { mask = _mm_load_si128((__m128i*)(intrinsic_mask[end_x - end_x_32 - 1])); _mm_maskmoveu_si128(_mm256_castsi256_si128(t0), mask, (char*)(p_dst + x)); } break; } } p_dst += i_dst; p_src += i_src; } } #endif // !HIGH_BIT_DEPTH davs2-1.6/source/common/vlc.h000066400000000000000000000154641337322544400161450ustar00rootroot00000000000000/* * vlc.h * * Description of this file: * VLC functions definition of the davs2 library * * -------------------------------------------------------------------------- * * davs2 - video encoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. 
*/ #ifndef DAVS2_VLC_H #define DAVS2_VLC_H #ifdef __cplusplus extern "C" { #endif /* --------------------------------------------------------------------------- * reads bits from the bitstream buffer * Input: * p_buf - containing VLC-coded data bits * i_bit_pos - bit offset from start of partition * i_buf - total bytes in bitstream * i_bits - number of bits to read * return 0 for success, otherwise failure */ static INLINE int read_bits(uint8_t *p_buf, int i_buf, int i_bit_pos, int *p_info, int i_bits) { int byte_offset = i_bit_pos >> 3; // byte from start of buffer int bit_offset = 7 - (i_bit_pos & 7); // bit from start of byte int inf = 0; while (i_bits--) { inf <<= 1; inf |= (p_buf[byte_offset] & (1 << bit_offset)) >> bit_offset; bit_offset--; if (bit_offset < 0) { byte_offset++; bit_offset += 8; if (byte_offset > i_buf) { return -1; /* error */ } } } *p_info = inf; return 0; } /* --------------------------------------------------------------------------- * RETURN: the length of symbol, or -1 on error */ static INLINE int get_vlc_symbol(uint8_t *p_buf, int i_bit_pos, int *info, int i_buf) { int byte_offset = i_bit_pos >> 3; // byte from start of buffer int bit_offset = 7 - (i_bit_pos & 7); // bit from start of byte int bit_counter = 1; int len = 1; int ctr_bit; // control bit for current bit position int info_bit; int inf; ctr_bit = (p_buf[byte_offset] & (1 << bit_offset)); // set up control bit while (ctr_bit == 0) { // find leading 1 bit len++; bit_offset -= 1; bit_counter++; if (bit_offset < 0) { // finish with current byte ? bit_offset = bit_offset + 8; byte_offset++; } ctr_bit = (p_buf[byte_offset] & (1 << bit_offset)); // set up control bit } // make info-word inf = 0; // shortest possible code is 1, then info is always 0 for (info_bit = 0; (info_bit < (len - 1)); info_bit++) { bit_counter++; bit_offset--; if (bit_offset < 0) { // finished with current byte ? 
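/* Exp-Golomb recap for this header: a ue(v) codeword consists of k zero
 * bits, a one bit, and k info bits, len = 2k + 1 bits in total, and decodes
 * to 2^k + info - 1. Worked example: the bits 00101 give len = 5, k = 2,
 * info = 01b = 1, so ue(v) = 4 + 1 - 1 = 4. se(v) then maps the unsigned
 * code n to (-1)^(n+1) * ceil(n / 2), e.g. n = 1, 2, 3, 4 -> +1, -1, +2, -2,
 * which vlc_se_v below computes as (n + 1) >> 1 with a sign flip when n is
 * even. */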
bit_offset = bit_offset + 8; byte_offset++; } if (byte_offset > i_buf) { return -1; /* error */ } inf = (inf << 1); if (p_buf[byte_offset] & (0x01 << (bit_offset))) { inf |= 1; } } *info = inf; // return absolute offset in bit from start of frame return bit_counter; } /* --------------------------------------------------------------------------- * reads an u(v) syntax element (FLC codeword) from UVLC-partition * RETURN: the value of the coded syntax element, or -1 on error */ static INLINE int vlc_u_v(davs2_bs_t *bs, int i_bits #if AVS2_TRACE , char *tracestring #endif ) { int ret_val = 0; if (read_bits(bs->p_stream, bs->i_stream, bs->i_bit_pos, &ret_val, i_bits) == 0) { bs->i_bit_pos += i_bits; /* move bitstream pointer */ #if AVS2_TRACE avs2_trace_string(tracestring, ret_val, i_bits); #endif return ret_val; } return -1; } /* --------------------------------------------------------------------------- * reads an ue(v) syntax element * RETURN: the value of the coded syntax element, or -1 on error */ static INLINE int vlc_ue_v(davs2_bs_t *bs #if AVS2_TRACE , char *tracestring #endif ) { int len, info; int ret_val; len = get_vlc_symbol(bs->p_stream, bs->i_bit_pos, &info, bs->i_stream); if (len == -1) { return -1; /* error */ } bs->i_bit_pos += len; // cal: pow(2, (len / 2)) + info - 1; ret_val = (1 << (len >> 1)) + info - 1; #if AVS2_TRACE avs2_trace_string2(tracestring, ret_val + 1, ret_val, len); #endif return ret_val; } /* --------------------------------------------------------------------------- * reads an se(v) syntax element * RETURN: the value of the coded syntax element, or -1 on error */ static INLINE int vlc_se_v(davs2_bs_t *bs #if AVS2_TRACE , char *tracestring #endif ) { int len, info; int ret_val; int n; len = get_vlc_symbol(bs->p_stream, bs->i_bit_pos, &info, bs->i_stream); if (len == -1) { return -1; /* error */ } bs->i_bit_pos += len; // cal: (int)pow(2, (len / 2)) + info - 1; n = (1 << (len >> 1)) + info - 1; ret_val = (n + 1) >> 1; if ((n & 1) == 0) { /* lsb is signed bit */ ret_val = -ret_val; } #if AVS2_TRACE avs2_trace_string2(tracestring, n + 1, ret_val, len); #endif return ret_val; } #if AVS2_TRACE #define u_flag(bs, tracestring) (bool_t)vlc_u_v(bs, 1, tracestring) #define u_v(bs, i_bits, tracestring) vlc_u_v(bs, i_bits, tracestring) #define ue_v(bs, tracestring) vlc_ue_v(bs, tracestring) #define se_v(bs, tracestring) vlc_se_v(bs, tracestring) #else #define u_flag(bs, tracestring) (bool_t)vlc_u_v(bs, 1) #define u_v(bs, i_bits, tracestring) vlc_u_v(bs, i_bits) #define ue_v(bs, tracestring) vlc_ue_v(bs) #define se_v(bs, tracestring) vlc_se_v(bs) #endif #ifdef __cplusplus } #endif #endif // DAVS2_VLC_H davs2-1.6/source/common/win32thread.cc000066400000000000000000000277051337322544400176520ustar00rootroot00000000000000/***************************************************************************** * win32thread.c: windows threading ***************************************************************************** * Copyright (C) 2010-2017 x264 project * * Authors: Steven Walters * Pegasys Inc. * Henrik Gramner * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ /* * changes of this file: * modified for davs2 library * * -------------------------------------------------------------------------- * * davs2 - video decoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. */ /* Microsoft's way of supporting systems with >64 logical cpus can be found at * http://www.microsoft.com/whdc/system/Sysinternals/MoreThan64proc.mspx */ /* Based on the agreed standing that davs2 decoder does not need to utilize >64 logical cpus, * this API does not detect nor utilize more than 64 cpus for systems that have them. */ #include "common.h" #if HAVE_WIN32THREAD #include /** * =========================================================================== * type defines * =========================================================================== */ /* number of times to spin a thread about to block on a locked mutex before retrying and sleeping if still locked */ #define XAVS2_SPIN_COUNT 0 /* GROUP_AFFINITY struct */ typedef struct { ULONG_PTR mask; // KAFFINITY = ULONG_PTR USHORT group; USHORT reserved[3]; } davs2_group_affinity_t; typedef void (WINAPI *cond_func_t)(davs2_thread_cond_t *cond); typedef BOOL (WINAPI *cond_wait_t)(davs2_thread_cond_t *cond, davs2_thread_mutex_t *mutex, DWORD milliseconds); typedef struct { /* global mutex for replacing MUTEX_INITIALIZER instances */ davs2_thread_mutex_t static_mutex; /* function pointers to conditional variable API on windows 6.0+ kernels */ cond_func_t cond_broadcast; cond_func_t cond_init; cond_func_t cond_signal; cond_wait_t cond_wait; } davs2_win32thread_control_t; static davs2_win32thread_control_t thread_control; /** * =========================================================================== * function defines * =========================================================================== */ /* _beginthreadex requires that the start routine is __stdcall */ static unsigned __stdcall davs2_win32thread_worker(void *arg) { davs2_thread_t *h = (davs2_thread_t *)arg; h->ret = h->func(h->arg); return 0; } int davs2_thread_create(davs2_thread_t *thread, const davs2_thread_attr_t *attr, void *(*start_routine)(void *), void *arg) { UNUSED_PARAMETER(attr); thread->func = start_routine; thread->arg = arg; thread->handle = (void *)_beginthreadex(NULL, 0, davs2_win32thread_worker, thread, 0, NULL); return !thread->handle; } int davs2_thread_join(davs2_thread_t thread, void **value_ptr) { DWORD ret = WaitForSingleObject(thread.handle, INFINITE); if (ret != WAIT_OBJECT_0) { return -1; } if (value_ptr) { *value_ptr = thread.ret; } CloseHandle(thread.handle); return 0; } int davs2_thread_mutex_init(davs2_thread_mutex_t *mutex, const davs2_thread_mutexattr_t *attr) { UNUSED_PARAMETER(attr); return !InitializeCriticalSectionAndSpinCount(mutex, XAVS2_SPIN_COUNT); } int davs2_thread_mutex_destroy(davs2_thread_mutex_t *mutex) { DeleteCriticalSection(mutex); return 0; } int davs2_thread_mutex_lock(davs2_thread_mutex_t *mutex) { static davs2_thread_mutex_t init = DAVS2_THREAD_MUTEX_INITIALIZER; if 
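/* Static-initializer emulation: pthreads offers PTHREAD_MUTEX_INITIALIZER,
 * but a Win32 CRITICAL_SECTION has no static initializer, so a mutex that
 * still matches the all-zero DAVS2_THREAD_MUTEX_INITIALIZER pattern is
 * swapped for the pre-initialized global static_mutex on first lock. */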
(!memcmp(mutex, &init, sizeof(davs2_thread_mutex_t))) { *mutex = thread_control.static_mutex; } EnterCriticalSection(mutex); return 0; } int davs2_thread_mutex_unlock(davs2_thread_mutex_t *mutex) { LeaveCriticalSection(mutex); return 0; } /* for pre-Windows 6.0 platforms we need to define and use our own condition variable and api */ typedef struct { davs2_thread_mutex_t mtx_broadcast; davs2_thread_mutex_t mtx_waiter_count; int waiter_count; HANDLE semaphore; HANDLE waiters_done; int is_broadcast; } davs2_win32_cond_t; int davs2_thread_cond_init(davs2_thread_cond_t *cond, const davs2_thread_condattr_t *attr) { davs2_win32_cond_t *win32_cond; UNUSED_PARAMETER(attr); if (thread_control.cond_init) { thread_control.cond_init(cond); return 0; } /* non-native condition variables */ win32_cond = (davs2_win32_cond_t *)davs2_malloc(sizeof(davs2_win32_cond_t)); if (!win32_cond) { return -1; } /* check before use: clearing a failed allocation would dereference NULL */ memset(win32_cond, 0, sizeof(davs2_win32_cond_t)); cond->ptr = win32_cond; win32_cond->semaphore = CreateSemaphore(NULL, 0, 0x7fffffff, NULL); if (!win32_cond->semaphore) { return -1; } if (davs2_thread_mutex_init(&win32_cond->mtx_waiter_count, NULL)) { return -1; } if (davs2_thread_mutex_init(&win32_cond->mtx_broadcast, NULL)) { return -1; } win32_cond->waiters_done = CreateEvent(NULL, FALSE, FALSE, NULL); if (!win32_cond->waiters_done) { return -1; } return 0; } int davs2_thread_cond_destroy(davs2_thread_cond_t *cond) { davs2_win32_cond_t *win32_cond; /* native condition variables need no explicit destruction */ if (thread_control.cond_init) { return 0; } /* non-native condition variables */ win32_cond = (davs2_win32_cond_t *)cond->ptr; CloseHandle(win32_cond->semaphore); CloseHandle(win32_cond->waiters_done); davs2_thread_mutex_destroy(&win32_cond->mtx_broadcast); davs2_thread_mutex_destroy(&win32_cond->mtx_waiter_count); davs2_free(win32_cond); return 0; } int davs2_thread_cond_broadcast(davs2_thread_cond_t *cond) { davs2_win32_cond_t *win32_cond; int have_waiter = 0; if (thread_control.cond_broadcast) { thread_control.cond_broadcast(cond); return 0; } /* non-native condition variables */ win32_cond = (davs2_win32_cond_t *)cond->ptr; davs2_thread_mutex_lock(&win32_cond->mtx_broadcast); davs2_thread_mutex_lock(&win32_cond->mtx_waiter_count); if (win32_cond->waiter_count) { win32_cond->is_broadcast = 1; have_waiter = 1; } if (have_waiter) { ReleaseSemaphore(win32_cond->semaphore, win32_cond->waiter_count, NULL); davs2_thread_mutex_unlock(&win32_cond->mtx_waiter_count); WaitForSingleObject(win32_cond->waiters_done, INFINITE); win32_cond->is_broadcast = 0; } else { davs2_thread_mutex_unlock(&win32_cond->mtx_waiter_count); } return davs2_thread_mutex_unlock(&win32_cond->mtx_broadcast); } int davs2_thread_cond_signal(davs2_thread_cond_t *cond) { davs2_win32_cond_t *win32_cond; int have_waiter; if (thread_control.cond_signal) { thread_control.cond_signal(cond); return 0; } /* non-native condition variables */ win32_cond = (davs2_win32_cond_t *)cond->ptr; davs2_thread_mutex_lock(&win32_cond->mtx_waiter_count); have_waiter = win32_cond->waiter_count; davs2_thread_mutex_unlock(&win32_cond->mtx_waiter_count); if (have_waiter) { ReleaseSemaphore(win32_cond->semaphore, 1, NULL); } return 0; } int davs2_thread_cond_wait(davs2_thread_cond_t *cond, davs2_thread_mutex_t *mutex) { davs2_win32_cond_t *win32_cond; int last_waiter; if (thread_control.cond_wait) { return !thread_control.cond_wait(cond, mutex, INFINITE); } /* non-native condition variables */ win32_cond = (davs2_win32_cond_t *)cond->ptr;
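/* Emulated condition-variable protocol (fallback for pre-Vista kernels):
 * a waiter increments waiter_count, releases the user mutex, and blocks on
 * the semaphore; signal releases the semaphore once, while broadcast
 * releases it waiter_count times and then waits on the waiters_done event
 * until the last woken waiter sets it, with mtx_broadcast serializing
 * broadcasts against newly arriving waiters. */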
davs2_thread_mutex_lock(&win32_cond->mtx_broadcast); davs2_thread_mutex_unlock(&win32_cond->mtx_broadcast); davs2_thread_mutex_lock(&win32_cond->mtx_waiter_count); win32_cond->waiter_count++; davs2_thread_mutex_unlock(&win32_cond->mtx_waiter_count); // unlock the external mutex davs2_thread_mutex_unlock(mutex); WaitForSingleObject(win32_cond->semaphore, INFINITE); davs2_thread_mutex_lock(&win32_cond->mtx_waiter_count); win32_cond->waiter_count--; last_waiter = !win32_cond->waiter_count && win32_cond->is_broadcast; davs2_thread_mutex_unlock(&win32_cond->mtx_waiter_count); if (last_waiter) { SetEvent(win32_cond->waiters_done); } // lock the external mutex return davs2_thread_mutex_lock(mutex); } int davs2_win32_threading_init(void) { /* find function pointers to API functions, if they exist */ HMODULE kernel_dll = GetModuleHandle(TEXT("kernel32")); thread_control.cond_init = (cond_func_t)GetProcAddress(kernel_dll, "InitializeConditionVariable"); if (thread_control.cond_init) { /* we're on a windows 6.0+ kernel, acquire the rest of the functions */ thread_control.cond_broadcast = (cond_func_t)GetProcAddress(kernel_dll, "WakeAllConditionVariable"); thread_control.cond_signal = (cond_func_t)GetProcAddress(kernel_dll, "WakeConditionVariable"); thread_control.cond_wait = (cond_wait_t)GetProcAddress(kernel_dll, "SleepConditionVariableCS"); } return davs2_thread_mutex_init(&thread_control.static_mutex, NULL); } void davs2_win32_threading_destroy(void) { davs2_thread_mutex_destroy(&thread_control.static_mutex); memset(&thread_control, 0, sizeof(davs2_win32thread_control_t)); } int davs2_thread_num_processors_np() { DWORD_PTR system_cpus, process_cpus = 0; int cpus = 0; DWORD_PTR bit; /* GetProcessAffinityMask returns affinities of 0 when the process has threads in multiple processor groups. * On platforms that support processor grouping, use GetThreadGroupAffinity to get the current thread's affinity instead. */ #if ARCH_X86_64 /* find function pointers to API functions specific to x86_64 platforms, if they exist. * BOOL GetThreadGroupAffinity(_In_ HANDLE hThread, _Out_ PGROUP_AFFINITY GroupAffinity); */ typedef BOOL(*get_thread_affinity_t)(HANDLE thread, davs2_group_affinity_t *group_affinity); HMODULE kernel_dll = GetModuleHandle(TEXT("kernel32.dll")); get_thread_affinity_t get_thread_affinity = (get_thread_affinity_t)GetProcAddress(kernel_dll, "GetThreadGroupAffinity"); if (get_thread_affinity) { /* running on a platform that supports >64 logical cpus */ davs2_group_affinity_t thread_affinity; if (get_thread_affinity(GetCurrentThread(), &thread_affinity)) { process_cpus = thread_affinity.mask; } } #endif if (!process_cpus) { GetProcessAffinityMask(GetCurrentProcess(), &process_cpus, &system_cpus); } for (bit = 1; bit; bit <<= 1) { cpus += !!(process_cpus & bit); } return cpus ? cpus : 1; } #endif // #if HAVE_WIN32THREAD davs2-1.6/source/common/win32thread.h000066400000000000000000000105361337322544400175060ustar00rootroot00000000000000/***************************************************************************** * win32thread.h: windows threading ***************************************************************************** * Copyright (C) 2010-2017 x264 project * * Authors: Steven Walters * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. 
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

/*
 * changes of this file:
 *    modified for davs2 library
 *
 * --------------------------------------------------------------------------
 *
 *    davs2 - video decoder of AVS2/IEEE1857.4 video coding standard
 *    Copyright (C) 2018~ VCL, NELVT, Peking University
 *
 *    Authors: Falei LUO
 *             etc.
 */

#ifndef DAVS2_WIN32THREAD_H
#define DAVS2_WIN32THREAD_H

#define WIN32_LEAN_AND_MEAN
#include <windows.h>

/* the following macro name is also used within the davs2 library */
#undef ERROR

#ifdef __cplusplus
extern "C" {
#endif

typedef struct {
    void *handle;
    void *(*func)(void *arg);
    void *arg;
    void *ret;
} davs2_thread_t;

#define davs2_thread_attr_t int

/* the condition variable api for Windows 6.0+ uses critical sections and not mutexes */
typedef CRITICAL_SECTION davs2_thread_mutex_t;
#define DAVS2_THREAD_MUTEX_INITIALIZER {0}
#define davs2_thread_mutexattr_t int
#define pthread_exit(a)

/* This is the CONDITIONAL_VARIABLE typedef for using Windows' native condition variables on kernels 6.0+.
 * MinGW does not currently have this typedef. */
typedef struct {
    void *ptr;
} davs2_thread_cond_t;
#define davs2_thread_condattr_t int

#define davs2_thread_create FPFX(thread_create)
int davs2_thread_create(davs2_thread_t *thread, const davs2_thread_attr_t *attr, void *(*start_routine)(void *), void *arg);
#define davs2_thread_join FPFX(thread_join)
int davs2_thread_join(davs2_thread_t thread, void **value_ptr);

#define davs2_thread_mutex_init FPFX(thread_mutex_init)
int davs2_thread_mutex_init(davs2_thread_mutex_t *mutex, const davs2_thread_mutexattr_t *attr);
#define davs2_thread_mutex_destroy FPFX(thread_mutex_destroy)
int davs2_thread_mutex_destroy(davs2_thread_mutex_t *mutex);
#define davs2_thread_mutex_lock FPFX(thread_mutex_lock)
int davs2_thread_mutex_lock(davs2_thread_mutex_t *mutex);
#define davs2_thread_mutex_unlock FPFX(thread_mutex_unlock)
int davs2_thread_mutex_unlock(davs2_thread_mutex_t *mutex);

#define davs2_thread_cond_init FPFX(thread_cond_init)
int davs2_thread_cond_init(davs2_thread_cond_t *cond, const davs2_thread_condattr_t *attr);
#define davs2_thread_cond_destroy FPFX(thread_cond_destroy)
int davs2_thread_cond_destroy(davs2_thread_cond_t *cond);
#define davs2_thread_cond_broadcast FPFX(thread_cond_broadcast)
int davs2_thread_cond_broadcast(davs2_thread_cond_t *cond);
#define davs2_thread_cond_wait FPFX(thread_cond_wait)
int davs2_thread_cond_wait(davs2_thread_cond_t *cond, davs2_thread_mutex_t *mutex);
#define davs2_thread_cond_signal FPFX(thread_cond_signal)
int davs2_thread_cond_signal(davs2_thread_cond_t *cond);

#define davs2_thread_attr_init(a) 0
#define davs2_thread_attr_destroy(a) 0

#define davs2_win32_threading_init FPFX(win32_threading_init)
int davs2_win32_threading_init(void);
#define davs2_win32_threading_destroy FPFX(win32_threading_destroy)
void davs2_win32_threading_destroy(void);

#define davs2_thread_num_processors_np FPFX(thread_num_processors_np)
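/* Usage sketch: the wrappers above mirror the POSIX threads API, so a caller
 * waits on a condition in the usual pthreads pattern (job_ready is a
 * hypothetical caller-side predicate, not part of this header):
 *
 *     davs2_thread_mutex_lock(&lock);
 *     while (!job_ready)
 *         davs2_thread_cond_wait(&cond, &lock);
 *     davs2_thread_mutex_unlock(&lock);
 *
 * The FPFX() macros rename each public symbol with the library prefix so the
 * decoder can be linked alongside other codecs shipping the same helpers. */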
int davs2_thread_num_processors_np(void); #ifdef __cplusplus } #endif #endif // DAVS2_WIN32THREAD_H davs2-1.6/source/common/x86/000077500000000000000000000000001337322544400156235ustar00rootroot00000000000000davs2-1.6/source/common/x86/blockcopy8.asm000066400000000000000000005005151337322544400204100ustar00rootroot00000000000000;***************************************************************************** ;* Copyright (C) 2013-2017 MulticoreWare, Inc ;* ;* Authors: Praveen Kumar Tiwari ;* Murugan Vairavel ;* Min Chen ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by ;* the Free Software Foundation; either version 2 of the License, or ;* (at your option) any later version. ;* ;* This program is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;* GNU General Public License for more details. ;* ;* You should have received a copy of the GNU General Public License ;* along with this program; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. ;* ;* This program is also available under a commercial proprietary license. ;* For more information, contact us at license @ x265.com. ;*****************************************************************************/ %include "x86inc.asm" %include "x86util.asm" SECTION_RODATA 32 cextern pb_4 cextern pb_1 cextern pb_16 cextern pb_64 cextern pw_4 cextern pb_8 cextern pb_32 cextern pb_128 SECTION .text ;----------------------------------------------------------------------------- ; void blockcopy_pp_2x4(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_pp_2x4, 4, 7, 0 mov r4w, [r2] mov r5w, [r2 + r3] mov r6w, [r2 + 2 * r3] lea r3, [r3 + 2 * r3] mov r3w, [r2 + r3] mov [r0], r4w mov [r0 + r1], r5w mov [r0 + 2 * r1], r6w lea r1, [r1 + 2 * r1] mov [r0 + r1], r3w RET ;----------------------------------------------------------------------------- ; void blockcopy_pp_2x8(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_pp_2x8, 4, 7, 0 lea r5, [3 * r1] lea r6, [3 * r3] mov r4w, [r2] mov [r0], r4w mov r4w, [r2 + r3] mov [r0 + r1], r4w mov r4w, [r2 + 2 * r3] mov [r0 + 2 * r1], r4w mov r4w, [r2 + r6] mov [r0 + r5], r4w lea r2, [r2 + 4 * r3] mov r4w, [r2] lea r0, [r0 + 4 * r1] mov [r0], r4w mov r4w, [r2 + r3] mov [r0 + r1], r4w mov r4w, [r2 + 2 * r3] mov [r0 + 2 * r1], r4w mov r4w, [r2 + r6] mov [r0 + r5], r4w RET ;----------------------------------------------------------------------------- ; void blockcopy_pp_2x16(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_pp_2x16, 4, 7, 0 lea r5, [3 * r1] lea r6, [3 * r3] mov r4w, [r2] mov [r0], r4w mov r4w, [r2 + r3] mov [r0 + r1], r4w mov r4w, [r2 + 2 * r3] mov [r0 + 2 * r1], r4w mov r4w, [r2 + r6] mov [r0 + r5], r4w %rep 3 lea r2, [r2 + 4 * r3] mov r4w, [r2] lea r0, [r0 + 4 * r1] mov [r0], r4w mov r4w, [r2 + r3] mov [r0 + r1], r4w mov r4w, [r2 + 2 * r3] mov [r0 + 2 * r1], r4w mov r4w, [r2 + r6] mov [r0 + r5], r4w %endrep RET 
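; NOTE: for the 2-pixel-wide blocks above, each row is only 16 bits, so the
; copies use plain GPR word loads/stores instead of SIMD registers; the
; precomputed 3*stride offsets let a single lea serve four rows at a time.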
;----------------------------------------------------------------------------- ; void blockcopy_pp_4x2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_pp_4x2, 4, 6, 0 mov r4d, [r2] mov r5d, [r2 + r3] mov [r0], r4d mov [r0 + r1], r5d RET ;----------------------------------------------------------------------------- ; void blockcopy_pp_4x4(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_pp_4x4, 4, 4, 4 movd m0, [r2] movd m1, [r2 + r3] movd m2, [r2 + 2 * r3] lea r3, [r3 + r3 * 2] movd m3, [r2 + r3] movd [r0], m0 movd [r0 + r1], m1 movd [r0 + 2 * r1], m2 lea r1, [r1 + 2 * r1] movd [r0 + r1], m3 RET ;----------------------------------------------------------------------------- ; void blockcopy_pp_4x8(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_pp_4x8, 4, 6, 4 lea r4, [3 * r1] lea r5, [3 * r3] movd m0, [r2] movd m1, [r2 + r3] movd m2, [r2 + 2 * r3] movd m3, [r2 + r5] movd [r0], m0 movd [r0 + r1], m1 movd [r0 + 2 * r1], m2 movd [r0 + r4], m3 lea r2, [r2 + 4 * r3] movd m0, [r2] movd m1, [r2 + r3] movd m2, [r2 + 2 * r3] movd m3, [r2 + r5] lea r0, [r0 + 4 * r1] movd [r0], m0 movd [r0 + r1], m1 movd [r0 + 2 * r1], m2 movd [r0 + r4], m3 RET ;----------------------------------------------------------------------------- ; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_PP_W4_H8 2 INIT_XMM sse2 cglobal blockcopy_pp_%1x%2, 4, 7, 4 mov r4d, %2/8 lea r5, [3 * r1] lea r6, [3 * r3] .loop: movd m0, [r2] movd m1, [r2 + r3] movd m2, [r2 + 2 * r3] movd m3, [r2 + r6] movd [r0], m0 movd [r0 + r1], m1 movd [r0 + 2 * r1], m2 movd [r0 + r5], m3 lea r2, [r2 + 4 * r3] movd m0, [r2] movd m1, [r2 + r3] movd m2, [r2 + 2 * r3] movd m3, [r2 + r6] lea r0, [r0 + 4 * r1] movd [r0], m0 movd [r0 + r1], m1 movd [r0 + 2 * r1], m2 movd [r0 + r5], m3 lea r0, [r0 + 4 * r1] lea r2, [r2 + 4 * r3] dec r4d jnz .loop RET %endmacro BLOCKCOPY_PP_W4_H8 4, 16 BLOCKCOPY_PP_W4_H8 4, 32 ;----------------------------------------------------------------------------- ; void blockcopy_pp_6x8(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_pp_6x8, 4, 7, 3 movd m0, [r2] mov r4w, [r2 + 4] movd m1, [r2 + r3] mov r5w, [r2 + r3 + 4] movd m2, [r2 + 2 * r3] mov r6w, [r2 + 2 * r3 + 4] movd [r0], m0 mov [r0 + 4], r4w movd [r0 + r1], m1 mov [r0 + r1 + 4], r5w movd [r0 + 2 * r1], m2 mov [r0 + 2 * r1 + 4], r6w lea r2, [r2 + 2 * r3] movd m0, [r2 + r3] mov r4w, [r2 + r3 + 4] movd m1, [r2 + 2 * r3] mov r5w, [r2 + 2 * r3 + 4] lea r2, [r2 + 2 * r3] movd m2, [r2 + r3] mov r6w, [r2 + r3 + 4] lea r0, [r0 + 2 * r1] movd [r0 + r1], m0 mov [r0 + r1 + 4], r4w movd [r0 + 2 * r1], m1 mov [r0 + 2 * r1 + 4], r5w lea r0, [r0 + 2 * r1] movd [r0 + r1], m2 mov [r0 + r1 + 4], r6w lea r2, [r2 + 2 * r3] movd m0, [r2] mov r4w, [r2 + 4] movd m1, [r2 + r3] mov r5w, [r2 + r3 + 4] lea r0, [r0 + 2 * r1] movd [r0], m0 mov [r0 + 4], r4w movd [r0 + r1], m1 mov [r0 + r1 + 4], r5w RET 
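; NOTE: 6-pixel rows do not map cleanly onto one register, so each row above
; is copied as a 4-byte movd plus a 2-byte GPR move for the remaining pair
; of pixels; the same split is used by the 6-wide kernels that follow.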
;----------------------------------------------------------------------------- ; void blockcopy_pp_6x16(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_pp_6x16, 4, 7, 2 mov r6d, 16/2 .loop: movd m0, [r2] mov r4w, [r2 + 4] movd m1, [r2 + r3] mov r5w, [r2 + r3 + 4] lea r2, [r2 + r3 * 2] movd [r0], m0 mov [r0 + 4], r4w movd [r0 + r1], m1 mov [r0 + r1 + 4], r5w lea r0, [r0 + r1 * 2] dec r6d jnz .loop RET ;----------------------------------------------------------------------------- ; void blockcopy_pp_8x2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_pp_8x2, 4, 4, 2 movh m0, [r2] movh m1, [r2 + r3] movh [r0], m0 movh [r0 + r1], m1 RET ;----------------------------------------------------------------------------- ; void blockcopy_pp_8x4(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_pp_8x4, 4, 4, 4 movh m0, [r2] movh m1, [r2 + r3] movh m2, [r2 + 2 * r3] lea r3, [r3 + r3 * 2] movh m3, [r2 + r3] movh [r0], m0 movh [r0 + r1], m1 movh [r0 + 2 * r1], m2 lea r1, [r1 + 2 * r1] movh [r0 + r1], m3 RET ;----------------------------------------------------------------------------- ; void blockcopy_pp_8x6(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_pp_8x6, 4, 4, 6 movh m0, [r2] movh m1, [r2 + r3] lea r2, [r2 + 2 * r3] movh m2, [r2] movh m3, [r2 + r3] lea r2, [r2 + 2 * r3] movh m4, [r2] movh m5, [r2 + r3] movh [r0], m0 movh [r0 + r1], m1 lea r0, [r0 + 2 * r1] movh [r0], m2 movh [r0 + r1], m3 lea r0, [r0 + 2 * r1] movh [r0], m4 movh [r0 + r1], m5 RET ;----------------------------------------------------------------------------- ; void blockcopy_pp_8x12(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_pp_8x12, 4, 6, 4 lea r4, [3 * r3] lea r5, [3 * r1] movh m0, [r2] movh m1, [r2 + r3] movh m2, [r2 + 2 * r3] movh m3, [r2 + r4] movh [r0], m0 movh [r0 + r1], m1 movh [r0 + 2 * r1], m2 movh [r0 + r5], m3 %rep 2 lea r2, [r2 + 4 * r3] movh m0, [r2] movh m1, [r2 + r3] movh m2, [r2 + 2 * r3] movh m3, [r2 + r4] lea r0, [r0 + 4 * r1] movh [r0], m0 movh [r0 + r1], m1 movh [r0 + 2 * r1], m2 movh [r0 + r5], m3 %endrep RET ;----------------------------------------------------------------------------- ; void blockcopy_pp_8x8(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_pp_8x8, 4, 6, 4 lea r4, [3 * r3] lea r5, [3 * r1] movh m0, [r2] movh m1, [r2 + r3] movh m2, [r2 + 2 * r3] movh m3, [r2 + r4] movh [r0], m0 movh [r0 + r1], m1 movh [r0 + 2 * r1], m2 movh [r0 + r5], m3 lea r2, [r2 + 4 * r3] movh m0, [r2] movh m1, [r2 + r3] movh m2, [r2 + 2 * r3] movh m3, [r2 + r4] lea r0, [r0 + 4 * r1] movh [r0], m0 movh [r0 + r1], m1 movh [r0 + 2 * r1], m2 movh [r0 + r5], m3 RET ;----------------------------------------------------------------------------- ; void blockcopy_pp_8x16(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) 
;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_pp_8x16, 4, 6, 4 lea r4, [3 * r3] lea r5, [3 * r1] movh m0, [r2] movh m1, [r2 + r3] movh m2, [r2 + 2 * r3] movh m3, [r2 + r4] movh [r0], m0 movh [r0 + r1], m1 movh [r0 + 2 * r1], m2 movh [r0 + r5], m3 %rep 3 lea r2, [r2 + 4 * r3] movh m0, [r2] movh m1, [r2 + r3] movh m2, [r2 + 2 * r3] movh m3, [r2 + r4] lea r0, [r0 + 4 * r1] movh [r0], m0 movh [r0 + r1], m1 movh [r0 + 2 * r1], m2 movh [r0 + r5], m3 %endrep RET ;----------------------------------------------------------------------------- ; void blockcopy_pp_8x32(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_pp_8x32, 4, 6, 4 lea r4, [3 * r3] lea r5, [3 * r1] movh m0, [r2] movh m1, [r2 + r3] movh m2, [r2 + 2 * r3] movh m3, [r2 + r4] movh [r0], m0 movh [r0 + r1], m1 movh [r0 + 2 * r1], m2 movh [r0 + r5], m3 %rep 7 lea r2, [r2 + 4 * r3] movh m0, [r2] movh m1, [r2 + r3] movh m2, [r2 + 2 * r3] movh m3, [r2 + r4] lea r0, [r0 + 4 * r1] movh [r0], m0 movh [r0 + r1], m1 movh [r0 + 2 * r1], m2 movh [r0 + r5], m3 %endrep RET ;----------------------------------------------------------------------------- ; void blockcopy_pp_8x64(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_pp_8x64, 4, 6, 4 lea r4, [3 * r3] lea r5, [3 * r1] movh m0, [r2] movh m1, [r2 + r3] movh m2, [r2 + 2 * r3] movh m3, [r2 + r4] movh [r0], m0 movh [r0 + r1], m1 movh [r0 + 2 * r1], m2 movh [r0 + r5], m3 %rep 15 lea r2, [r2 + 4 * r3] movh m0, [r2] movh m1, [r2 + r3] movh m2, [r2 + 2 * r3] movh m3, [r2 + r4] lea r0, [r0 + 4 * r1] movh [r0], m0 movh [r0 + r1], m1 movh [r0 + 2 * r1], m2 movh [r0 + r5], m3 %endrep RET ;----------------------------------------------------------------------------- ; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_PP_W12_H4 2 INIT_XMM sse2 cglobal blockcopy_pp_%1x%2, 4, 5, 4 mov r4d, %2/4 .loop: movh m0, [r2] movd m1, [r2 + 8] movh m2, [r2 + r3] movd m3, [r2 + r3 + 8] lea r2, [r2 + 2 * r3] movh [r0], m0 movd [r0 + 8], m1 movh [r0 + r1], m2 movd [r0 + r1 + 8], m3 lea r0, [r0 + 2 * r1] movh m0, [r2] movd m1, [r2 + 8] movh m2, [r2 + r3] movd m3, [r2 + r3 + 8] movh [r0], m0 movd [r0 + 8], m1 movh [r0 + r1], m2 movd [r0 + r1 + 8], m3 dec r4d lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] jnz .loop RET %endmacro BLOCKCOPY_PP_W12_H4 12, 16 BLOCKCOPY_PP_W12_H4 12, 32 ;----------------------------------------------------------------------------- ; void blockcopy_pp_16x4(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_PP_W16_H4 2 INIT_XMM sse2 cglobal blockcopy_pp_%1x%2, 4, 5, 4 mov r4d, %2/4 .loop: movu m0, [r2] movu m1, [r2 + r3] lea r2, [r2 + 2 * r3] movu m2, [r2] movu m3, [r2 + r3] movu [r0], m0 movu [r0 + r1], m1 lea r0, [r0 + 2 * r1] movu [r0], m2 movu [r0 + r1], m3 dec r4d lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] jnz .loop RET %endmacro BLOCKCOPY_PP_W16_H4 16, 4 BLOCKCOPY_PP_W16_H4 16, 12 ;----------------------------------------------------------------------------- ; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const 
pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_PP_W16_H8 2 INIT_XMM sse2 cglobal blockcopy_pp_%1x%2, 4, 5, 6 mov r4d, %2/8 .loop: movu m0, [r2] movu m1, [r2 + r3] lea r2, [r2 + 2 * r3] movu m2, [r2] movu m3, [r2 + r3] lea r2, [r2 + 2 * r3] movu m4, [r2] movu m5, [r2 + r3] lea r2, [r2 + 2 * r3] movu [r0], m0 movu [r0 + r1], m1 lea r0, [r0 + 2 * r1] movu [r0], m2 movu [r0 + r1], m3 lea r0, [r0 + 2 * r1] movu [r0], m4 movu [r0 + r1], m5 lea r0, [r0 + 2 * r1] movu m0, [r2] movu m1, [r2 + r3] movu [r0], m0 movu [r0 + r1], m1 dec r4d lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] jnz .loop RET %endmacro BLOCKCOPY_PP_W16_H8 16, 8 BLOCKCOPY_PP_W16_H8 16, 16 BLOCKCOPY_PP_W16_H8 16, 32 BLOCKCOPY_PP_W16_H8 16, 64 BLOCKCOPY_PP_W16_H8 16, 24 ;----------------------------------------------------------------------------- ; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_PP_W24_H4 2 INIT_XMM sse2 cglobal blockcopy_pp_%1x%2, 4, 5, 6 mov r4d, %2/4 .loop: movu m0, [r2] movh m1, [r2 + 16] movu m2, [r2 + r3] movh m3, [r2 + r3 + 16] lea r2, [r2 + 2 * r3] movu m4, [r2] movh m5, [r2 + 16] movu [r0], m0 movh [r0 + 16], m1 movu [r0 + r1], m2 movh [r0 + r1 + 16], m3 lea r0, [r0 + 2 * r1] movu [r0], m4 movh [r0 + 16], m5 movu m0, [r2 + r3] movh m1, [r2 + r3 + 16] movu [r0 + r1], m0 movh [r0 + r1 + 16], m1 dec r4d lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] jnz .loop RET %endmacro BLOCKCOPY_PP_W24_H4 24, 32 BLOCKCOPY_PP_W24_H4 24, 64 ;----------------------------------------------------------------------------- ; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_PP_W32_H4 2 INIT_XMM sse2 cglobal blockcopy_pp_%1x%2, 4, 5, 4 mov r4d, %2/4 .loop: movu m0, [r2] movu m1, [r2 + 16] movu m2, [r2 + r3] movu m3, [r2 + r3 + 16] lea r2, [r2 + 2 * r3] movu [r0], m0 movu [r0 + 16], m1 movu [r0 + r1], m2 movu [r0 + r1 + 16], m3 lea r0, [r0 + 2 * r1] movu m0, [r2] movu m1, [r2 + 16] movu m2, [r2 + r3] movu m3, [r2 + r3 + 16] movu [r0], m0 movu [r0 + 16], m1 movu [r0 + r1], m2 movu [r0 + r1 + 16], m3 dec r4d lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] jnz .loop RET %endmacro BLOCKCOPY_PP_W32_H4 32, 8 BLOCKCOPY_PP_W32_H4 32, 16 BLOCKCOPY_PP_W32_H4 32, 24 BLOCKCOPY_PP_W32_H4 32, 32 BLOCKCOPY_PP_W32_H4 32, 64 BLOCKCOPY_PP_W32_H4 32, 48 INIT_YMM avx cglobal blockcopy_pp_32x8, 4, 6, 6 lea r4, [3 * r1] lea r5, [3 * r3] movu m0, [r2] movu m1, [r2 + r3] movu m2, [r2 + 2 * r3] movu m3, [r2 + r5] lea r2, [r2 + 4 * r3] movu m4, [r2] movu m5, [r2 + r3] movu [r0], m0 movu [r0 + r1], m1 movu [r0 + 2 * r1], m2 movu [r0 + r4], m3 lea r0, [r0 + 4 * r1] movu [r0], m4 movu [r0 + r1], m5 movu m0, [r2 + 2 * r3] movu m1, [r2 + r5] movu [r0 + 2 * r1], m0 movu [r0 + r4], m1 RET INIT_YMM avx cglobal blockcopy_pp_32x16, 4, 6, 6 lea r4, [3 * r1] lea r5, [3 * r3] movu m0, [r2] movu m1, [r2 + r3] movu m2, [r2 + 2 * r3] movu m3, [r2 + r5] lea r2, [r2 + 4 * r3] movu m4, [r2] movu m5, [r2 + r3] movu [r0], m0 movu [r0 + r1], m1 movu [r0 + 2 * r1], m2 movu [r0 + r4], m3 lea r0, [r0 + 4 * r1] movu [r0], m4 movu [r0 + r1], m5 movu m0, [r2 + 2 * r3] movu m1, [r2 + r5] lea r2, [r2 + 4 * r3] movu m2, [r2] movu m3, [r2 + r3] movu m4, [r2 + 2 * r3] movu m5, [r2 + r5] movu [r0 + 2 * r1], m0 movu [r0 + r4], m1 lea r0, [r0 
+ 4 * r1] movu [r0], m2 movu [r0 + r1], m3 movu [r0 + 2 * r1], m4 movu [r0 + r4], m5 lea r2, [r2 + 4 * r3] movu m0, [r2] movu m1, [r2 + r3] movu m2, [r2 + 2 * r3] movu m3, [r2 + r5] lea r0, [r0 + 4 * r1] movu [r0], m0 movu [r0 + r1], m1 movu [r0 + 2 * r1], m2 movu [r0 + r4], m3 RET ;----------------------------------------------------------------------------- ; void blockcopy_pp_32x24(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_YMM avx cglobal blockcopy_pp_32x24, 4, 7, 6 lea r4, [3 * r1] lea r5, [3 * r3] mov r6d, 24/8 .loop: movu m0, [r2] movu m1, [r2 + r3] movu m2, [r2 + 2 * r3] movu m3, [r2 + r5] lea r2, [r2 + 4 * r3] movu m4, [r2] movu m5, [r2 + r3] movu [r0], m0 movu [r0 + r1], m1 movu [r0 + 2 * r1], m2 movu [r0 + r4], m3 lea r0, [r0 + 4 * r1] movu [r0], m4 movu [r0 + r1], m5 movu m0, [r2 + 2 * r3] movu m1, [r2 + r5] movu [r0 + 2 * r1], m0 movu [r0 + r4], m1 lea r2, [r2 + 4 * r3] lea r0, [r0 + 4 * r1] dec r6d jnz .loop RET ;----------------------------------------------------------------------------- ; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_PP_W32_H16_avx 2 INIT_YMM avx cglobal blockcopy_pp_%1x%2, 4, 7, 6 lea r4, [3 * r1] lea r5, [3 * r3] mov r6d, %2/16 .loop: movu m0, [r2] movu m1, [r2 + r3] movu m2, [r2 + 2 * r3] movu m3, [r2 + r5] lea r2, [r2 + 4 * r3] movu m4, [r2] movu m5, [r2 + r3] movu [r0], m0 movu [r0 + r1], m1 movu [r0 + 2 * r1], m2 movu [r0 + r4], m3 lea r0, [r0 + 4 * r1] movu [r0], m4 movu [r0 + r1], m5 movu m0, [r2 + 2 * r3] movu m1, [r2 + r5] lea r2, [r2 + 4 * r3] movu m2, [r2] movu m3, [r2 + r3] movu m4, [r2 + 2 * r3] movu m5, [r2 + r5] movu [r0 + 2 * r1], m0 movu [r0 + r4], m1 lea r0, [r0 + 4 * r1] movu [r0], m2 movu [r0 + r1], m3 movu [r0 + 2 * r1], m4 movu [r0 + r4], m5 lea r2, [r2 + 4 * r3] movu m0, [r2] movu m1, [r2 + r3] movu m2, [r2 + 2 * r3] movu m3, [r2 + r5] lea r0, [r0 + 4 * r1] movu [r0], m0 movu [r0 + r1], m1 movu [r0 + 2 * r1], m2 movu [r0 + r4], m3 lea r2, [r2 + 4 * r3] lea r0, [r0 + 4 * r1] dec r6d jnz .loop RET %endmacro BLOCKCOPY_PP_W32_H16_avx 32, 32 BLOCKCOPY_PP_W32_H16_avx 32, 48 BLOCKCOPY_PP_W32_H16_avx 32, 64 ;----------------------------------------------------------------------------- ; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_PP_W48_H2 2 INIT_XMM sse2 cglobal blockcopy_pp_%1x%2, 4, 5, 6 mov r4d, %2/4 .loop: movu m0, [r2] movu m1, [r2 + 16] movu m2, [r2 + 32] movu m3, [r2 + r3] movu m4, [r2 + r3 + 16] movu m5, [r2 + r3 + 32] lea r2, [r2 + 2 * r3] movu [r0], m0 movu [r0 + 16], m1 movu [r0 + 32], m2 movu [r0 + r1], m3 movu [r0 + r1 + 16], m4 movu [r0 + r1 + 32], m5 lea r0, [r0 + 2 * r1] movu m0, [r2] movu m1, [r2 + 16] movu m2, [r2 + 32] movu m3, [r2 + r3] movu m4, [r2 + r3 + 16] movu m5, [r2 + r3 + 32] movu [r0], m0 movu [r0 + 16], m1 movu [r0 + 32], m2 movu [r0 + r1], m3 movu [r0 + r1 + 16], m4 movu [r0 + r1 + 32], m5 dec r4d lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] jnz .loop RET %endmacro BLOCKCOPY_PP_W48_H2 48, 64 ;----------------------------------------------------------------------------- ; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) 
;----------------------------------------------------------------------------- %macro BLOCKCOPY_PP_W48_H4_avx 2 INIT_YMM avx cglobal blockcopy_pp_%1x%2, 4, 5, 4 mov r4d, %2/4 .loop: movu m0, [r2] movu xm1, [r2 + 32] movu m2, [r2 + r3] movu xm3, [r2 + r3 + 32] lea r2, [r2 + 2 * r3] movu [r0], m0 movu [r0 + 32], xm1 movu [r0 + r1], m2 movu [r0 + r1 + 32], xm3 lea r0, [r0 + 2 * r1] movu m0, [r2] movu xm1, [r2 + 32] movu m2, [r2 + r3] movu xm3, [r2 + r3 + 32] movu [r0], m0 movu [r0 + 32], xm1 movu [r0 + r1], m2 movu [r0 + r1 + 32], xm3 dec r4d lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] jnz .loop RET %endmacro BLOCKCOPY_PP_W48_H4_avx 48, 64 ;----------------------------------------------------------------------------- ; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_PP_W64_H4 2 INIT_XMM sse2 cglobal blockcopy_pp_%1x%2, 4, 5, 6 mov r4d, %2/4 .loop: movu m0, [r2] movu m1, [r2 + 16] movu m2, [r2 + 32] movu m3, [r2 + 48] movu m4, [r2 + r3] movu m5, [r2 + r3 + 16] movu [r0], m0 movu [r0 + 16], m1 movu [r0 + 32], m2 movu [r0 + 48], m3 movu [r0 + r1], m4 movu [r0 + r1 + 16], m5 movu m0, [r2 + r3 + 32] movu m1, [r2 + r3 + 48] lea r2, [r2 + 2 * r3] movu m2, [r2] movu m3, [r2 + 16] movu m4, [r2 + 32] movu m5, [r2 + 48] movu [r0 + r1 + 32], m0 movu [r0 + r1 + 48], m1 lea r0, [r0 + 2 * r1] movu [r0], m2 movu [r0 + 16], m3 movu [r0 + 32], m4 movu [r0 + 48], m5 movu m0, [r2 + r3] movu m1, [r2 + r3 + 16] movu m2, [r2 + r3 + 32] movu m3, [r2 + r3 + 48] movu [r0 + r1], m0 movu [r0 + r1 + 16], m1 movu [r0 + r1 + 32], m2 movu [r0 + r1 + 48], m3 dec r4d lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] jnz .loop RET %endmacro BLOCKCOPY_PP_W64_H4 64, 16 BLOCKCOPY_PP_W64_H4 64, 32 BLOCKCOPY_PP_W64_H4 64, 48 BLOCKCOPY_PP_W64_H4 64, 64 ;----------------------------------------------------------------------------- ; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_PP_W64_H4_avx 2 INIT_YMM avx cglobal blockcopy_pp_%1x%2, 4, 7, 6 lea r4, [3 * r1] lea r5, [3 * r3] mov r6d, %2/4 .loop: movu m0, [r2] movu m1, [r2 + 32] movu m2, [r2 + r3] movu m3, [r2 + r3 + 32] movu m4, [r2 + 2 * r3] movu m5, [r2 + 2 * r3 + 32] movu [r0], m0 movu [r0 + 32], m1 movu [r0 + r1], m2 movu [r0 + r1 + 32], m3 movu [r0 + 2 * r1], m4 movu [r0 + 2 * r1 + 32], m5 movu m0, [r2 + r5] movu m1, [r2 + r5 + 32] movu [r0 + r4], m0 movu [r0 + r4 + 32], m1 lea r2, [r2 + 4 * r3] lea r0, [r0 + 4 * r1] dec r6d jnz .loop RET %endmacro BLOCKCOPY_PP_W64_H4_avx 64, 16 BLOCKCOPY_PP_W64_H4_avx 64, 32 BLOCKCOPY_PP_W64_H4_avx 64, 48 BLOCKCOPY_PP_W64_H4_avx 64, 64 ;----------------------------------------------------------------------------- ; void blockcopy_sp_2x4(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal blockcopy_sp_2x4, 4, 5, 2 add r3, r3 ;Row 0-1 movd m0, [r2] movd m1, [r2 + r3] packuswb m0, m1 movd r4d, m0 mov [r0], r4w pextrw [r0 + r1], m0, 4 ;Row 2-3 movd m0, [r2 + 2 * r3] lea r2, [r2 + 2 * r3] movd m1, [r2 + r3] packuswb m0, m1 movd r4d, m0 mov [r0 + 2 * r1], r4w lea r0, [r0 + 2 * r1] pextrw [r0 + r1], m0, 4 RET ;----------------------------------------------------------------------------- ; void blockcopy_sp_2x8(pixel* dst, intptr_t dstStride, const int16_t* 
src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal blockcopy_sp_2x8, 4, 5, 2 add r3, r3 ;Row 0-1 movd m0, [r2] movd m1, [r2 + r3] packuswb m0, m1 movd r4d, m0 mov [r0], r4w pextrw [r0 + r1], m0, 4 ;Row 2-3 movd m0, [r2 + 2 * r3] lea r2, [r2 + 2 * r3] movd m1, [r2 + r3] packuswb m0, m1 movd r4d, m0 mov [r0 + 2 * r1], r4w lea r0, [r0 + 2 * r1] pextrw [r0 + r1], m0, 4 ;Row 4-5 movd m0, [r2 + 2 * r3] lea r2, [r2 + 2 * r3] movd m1, [r2 + r3] packuswb m0, m1 movd r4d, m0 mov [r0 + 2 * r1], r4w lea r0, [r0 + 2 * r1] pextrw [r0 + r1], m0, 4 ;Row 6-7 movd m0, [r2 + 2 * r3] lea r2, [r2 + 2 * r3] movd m1, [r2 + r3] packuswb m0, m1 movd r4d, m0 mov [r0 + 2 * r1], r4w lea r0, [r0 + 2 * r1] pextrw [r0 + r1], m0, 4 RET ;----------------------------------------------------------------------------- ; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SP_W2_H2 2 INIT_XMM sse2 cglobal blockcopy_sp_%1x%2, 4, 7, 2, dst, dstStride, src, srcStride add r3, r3 mov r6d, %2/2 .loop: movd m0, [r2] movd m1, [r2 + r3] dec r6d lea r2, [r2 + r3 * 2] packuswb m0, m0 packuswb m1, m1 movd r4d, m0 movd r5d, m1 mov [r0], r4w mov [r0 + r1], r5w lea r0, [r0 + r1 * 2] jnz .loop RET %endmacro BLOCKCOPY_SP_W2_H2 2, 4 BLOCKCOPY_SP_W2_H2 2, 8 BLOCKCOPY_SP_W2_H2 2, 16 ;----------------------------------------------------------------------------- ; void blockcopy_sp_4x2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_sp_4x2, 4, 4, 2, dst, dstStride, src, srcStride add r3, r3 movh m0, [r2] movh m1, [r2 + r3] packuswb m0, m1 movd [r0], m0 pshufd m0, m0, 2 movd [r0 + r1], m0 RET ;----------------------------------------------------------------------------- ; void blockcopy_sp_4x4(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_sp_4x4, 4, 4, 4, dst, dstStride, src, srcStride add r3, r3 movh m0, [r2] movh m1, [r2 + r3] movh m2, [r2 + 2 * r3] lea r2, [r2 + 2 * r3] movh m3, [r2 + r3] packuswb m0, m1 packuswb m2, m3 movd [r0], m0 pshufd m0, m0, 2 movd [r0 + r1], m0 movd [r0 + 2 * r1], m2 lea r0, [r0 + 2 * r1] pshufd m2, m2, 2 movd [r0 + r1], m2 RET ;----------------------------------------------------------------------------- ; void blockcopy_sp_4x8(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_sp_4x8, 4, 4, 8, dst, dstStride, src, srcStride add r3, r3 movh m0, [r2] movh m1, [r2 + r3] movh m2, [r2 + 2 * r3] lea r2, [r2 + 2 * r3] movh m3, [r2 + r3] movh m4, [r2 + 2 * r3] lea r2, [r2 + 2 * r3] movh m5, [r2 + r3] movh m6, [r2 + 2 * r3] lea r2, [r2 + 2 * r3] movh m7, [r2 + r3] packuswb m0, m1 packuswb m2, m3 packuswb m4, m5 packuswb m6, m7 movd [r0], m0 pshufd m0, m0, 2 movd [r0 + r1], m0 movd [r0 + 2 * r1], m2 lea r0, [r0 + 2 * r1] pshufd m2, m2, 2 movd [r0 + r1], m2 movd [r0 + 2 * r1], m4 lea r0, [r0 + 2 * r1] pshufd m4, m4, 2 movd [r0 + r1], m4 movd [r0 + 2 * r1], m6 lea r0, [r0 + 2 * r1] pshufd m6, m6, 2 movd [r0 + r1], m6 RET ;----------------------------------------------------------------------------- ; void 
blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SP_W4_H8 2 INIT_XMM sse2 cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride mov r4d, %2/8 add r3, r3 .loop: movh m0, [r2] movh m1, [r2 + r3] movh m2, [r2 + 2 * r3] lea r2, [r2 + 2 * r3] movh m3, [r2 + r3] movh m4, [r2 + 2 * r3] lea r2, [r2 + 2 * r3] movh m5, [r2 + r3] movh m6, [r2 + 2 * r3] lea r2, [r2 + 2 * r3] movh m7, [r2 + r3] packuswb m0, m1 packuswb m2, m3 packuswb m4, m5 packuswb m6, m7 movd [r0], m0 pshufd m0, m0, 2 movd [r0 + r1], m0 movd [r0 + 2 * r1], m2 lea r0, [r0 + 2 * r1] pshufd m2, m2, 2 movd [r0 + r1], m2 movd [r0 + 2 * r1], m4 lea r0, [r0 + 2 * r1] pshufd m4, m4, 2 movd [r0 + r1], m4 movd [r0 + 2 * r1], m6 lea r0, [r0 + 2 * r1] pshufd m6, m6, 2 movd [r0 + r1], m6 lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] dec r4d jnz .loop RET %endmacro BLOCKCOPY_SP_W4_H8 4, 16 BLOCKCOPY_SP_W4_H8 4, 32 ;----------------------------------------------------------------------------- ; void blockcopy_sp_6x8(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal blockcopy_sp_6x8, 4, 4, 2 add r3, r3 movu m0, [r2] movu m1, [r2 + r3] packuswb m0, m1 movd [r0], m0 pextrw [r0 + 4], m0, 2 movhlps m0, m0 movd [r0 + r1], m0 pextrw [r0 + r1 + 4], m0, 2 lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] movu m0, [r2] movu m1, [r2 + r3] packuswb m0, m1 movd [r0], m0 pextrw [r0 + 4], m0, 2 movhlps m0, m0 movd [r0 + r1], m0 pextrw [r0 + r1 + 4], m0, 2 lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] movu m0, [r2] movu m1, [r2 + r3] packuswb m0, m1 movd [r0], m0 pextrw [r0 + 4], m0, 2 movhlps m0, m0 movd [r0 + r1], m0 pextrw [r0 + r1 + 4], m0, 2 lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] movu m0, [r2] movu m1, [r2 + r3] packuswb m0, m1 movd [r0], m0 pextrw [r0 + 4], m0, 2 movhlps m0, m0 movd [r0 + r1], m0 pextrw [r0 + r1 + 4], m0, 2 RET ;----------------------------------------------------------------------------- ; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SP_W6_H2 2 INIT_XMM sse2 cglobal blockcopy_sp_%1x%2, 4, 7, 4, dst, dstStride, src, srcStride add r3, r3 mov r6d, %2/2 .loop: movh m0, [r2] movd m2, [r2 + 8] movh m1, [r2 + r3] movd m3, [r2 + r3 + 8] dec r6d lea r2, [r2 + r3 * 2] packuswb m0, m0 packuswb m2, m2 packuswb m1, m1 packuswb m3, m3 movd r4d, m2 movd r5d, m3 movd [r0], m0 mov [r0 + 4], r4w movd [r0 + r1], m1 mov [r0 + r1 + 4], r5w lea r0, [r0 + r1 * 2] jnz .loop RET %endmacro BLOCKCOPY_SP_W6_H2 6, 8 BLOCKCOPY_SP_W6_H2 6, 16 ;----------------------------------------------------------------------------- ; void blockcopy_sp_8x2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_sp_8x2, 4, 4, 2, dst, dstStride, src, srcStride add r3, r3 movu m0, [r2] movu m1, [r2 + r3] packuswb m0, m1 movlps [r0], m0 movhps [r0 + r1], m0 RET ;----------------------------------------------------------------------------- ; void blockcopy_sp_8x4(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_sp_8x4, 4, 
4, 4, dst, dstStride, src, srcStride add r3, r3 movu m0, [r2] movu m1, [r2 + r3] movu m2, [r2 + 2 * r3] lea r2, [r2 + 2 * r3] movu m3, [r2 + r3] packuswb m0, m1 packuswb m2, m3 movlps [r0], m0 movhps [r0 + r1], m0 movlps [r0 + 2 * r1], m2 lea r0, [r0 + 2 * r1] movhps [r0 + r1], m2 RET ;----------------------------------------------------------------------------- ; void blockcopy_sp_8x6(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_sp_8x6, 4, 4, 6, dst, dstStride, src, srcStride add r3, r3 movu m0, [r2] movu m1, [r2 + r3] movu m2, [r2 + 2 * r3] lea r2, [r2 + 2 * r3] movu m3, [r2 + r3] movu m4, [r2 + 2 * r3] lea r2, [r2 + 2 * r3] movu m5, [r2 + r3] packuswb m0, m1 packuswb m2, m3 packuswb m4, m5 movlps [r0], m0 movhps [r0 + r1], m0 movlps [r0 + 2 * r1], m2 lea r0, [r0 + 2 * r1] movhps [r0 + r1], m2 movlps [r0 + 2 * r1], m4 lea r0, [r0 + 2 * r1] movhps [r0 + r1], m4 RET ;----------------------------------------------------------------------------- ; void blockcopy_sp_8x8(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_sp_8x8, 4, 4, 8, dst, dstStride, src, srcStride add r3, r3 movu m0, [r2] movu m1, [r2 + r3] movu m2, [r2 + 2 * r3] lea r2, [r2 + 2 * r3] movu m3, [r2 + r3] movu m4, [r2 + 2 * r3] lea r2, [r2 + 2 * r3] movu m5, [r2 + r3] movu m6, [r2 + 2 * r3] lea r2, [r2 + 2 * r3] movu m7, [r2 + r3] packuswb m0, m1 packuswb m2, m3 packuswb m4, m5 packuswb m6, m7 movlps [r0], m0 movhps [r0 + r1], m0 movlps [r0 + 2 * r1], m2 lea r0, [r0 + 2 * r1] movhps [r0 + r1], m2 movlps [r0 + 2 * r1], m4 lea r0, [r0 + 2 * r1] movhps [r0 + r1], m4 movlps [r0 + 2 * r1], m6 lea r0, [r0 + 2 * r1] movhps [r0 + r1], m6 RET ;----------------------------------------------------------------------------- ; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SP_W8_H4 2 INIT_XMM sse2 cglobal blockcopy_sp_%1x%2, 4, 5, 4, dst, dstStride, src, srcStride add r3, r3 mov r4d, %2/4 .loop: movu m0, [r2] movu m1, [r2 + r3] lea r2, [r2 + r3 * 2] movu m2, [r2] movu m3, [r2 + r3] dec r4d lea r2, [r2 + r3 * 2] packuswb m0, m1 packuswb m2, m3 movlps [r0], m0 movhps [r0 + r1], m0 lea r0, [r0 + r1 * 2] movlps [r0], m2 movhps [r0 + r1], m2 lea r0, [r0 + r1 * 2] jnz .loop RET %endmacro BLOCKCOPY_SP_W8_H4 8, 12 ;----------------------------------------------------------------------------- ; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SP_W8_H8 2 INIT_XMM sse2 cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride mov r4d, %2/8 add r3, r3 .loop: movu m0, [r2] movu m1, [r2 + r3] movu m2, [r2 + 2 * r3] lea r2, [r2 + 2 * r3] movu m3, [r2 + r3] movu m4, [r2 + 2 * r3] lea r2, [r2 + 2 * r3] movu m5, [r2 + r3] movu m6, [r2 + 2 * r3] lea r2, [r2 + 2 * r3] movu m7, [r2 + r3] packuswb m0, m1 packuswb m2, m3 packuswb m4, m5 packuswb m6, m7 movlps [r0], m0 movhps [r0 + r1], m0 movlps [r0 + 2 * r1], m2 lea r0, [r0 + 2 * r1] movhps [r0 + r1], m2 movlps [r0 + 2 * r1], m4 lea r0, [r0 + 2 * r1] movhps [r0 + r1], m4 movlps [r0 + 2 * r1], m6 lea r0, [r0 + 2 * r1] movhps [r0 + r1], m6 lea r0, [r0 + 2 
* r1] lea r2, [r2 + 2 * r3] dec r4d jnz .loop RET %endmacro BLOCKCOPY_SP_W8_H8 8, 16 BLOCKCOPY_SP_W8_H8 8, 32 BLOCKCOPY_SP_W8_H8 8, 64 ;----------------------------------------------------------------------------- ; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SP_W12_H4 2 INIT_XMM sse2 cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride mov r4d, %2/4 add r3, r3 .loop: movu m0, [r2] movu m1, [r2 + 16] movu m2, [r2 + r3] movu m3, [r2 + r3 + 16] movu m4, [r2 + 2 * r3] movu m5, [r2 + 2 * r3 + 16] lea r2, [r2 + 2 * r3] movu m6, [r2 + r3] movu m7, [r2 + r3 + 16] packuswb m0, m1 packuswb m2, m3 packuswb m4, m5 packuswb m6, m7 movh [r0], m0 pshufd m0, m0, 2 movd [r0 + 8], m0 movh [r0 + r1], m2 pshufd m2, m2, 2 movd [r0 + r1 + 8], m2 movh [r0 + 2 * r1], m4 pshufd m4, m4, 2 movd [r0 + 2 * r1 + 8], m4 lea r0, [r0 + 2 * r1] movh [r0 + r1], m6 pshufd m6, m6, 2 movd [r0 + r1 + 8], m6 lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] dec r4d jnz .loop RET %endmacro BLOCKCOPY_SP_W12_H4 12, 16 BLOCKCOPY_SP_W12_H4 12, 32 ;----------------------------------------------------------------------------- ; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SP_W16_H4 2 INIT_XMM sse2 cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride mov r4d, %2/4 add r3, r3 .loop: movu m0, [r2] movu m1, [r2 + 16] movu m2, [r2 + r3] movu m3, [r2 + r3 + 16] movu m4, [r2 + 2 * r3] movu m5, [r2 + 2 * r3 + 16] lea r2, [r2 + 2 * r3] movu m6, [r2 + r3] movu m7, [r2 + r3 + 16] packuswb m0, m1 packuswb m2, m3 packuswb m4, m5 packuswb m6, m7 movu [r0], m0 movu [r0 + r1], m2 movu [r0 + 2 * r1], m4 lea r0, [r0 + 2 * r1] movu [r0 + r1], m6 lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] dec r4d jnz .loop RET %endmacro BLOCKCOPY_SP_W16_H4 16, 4 BLOCKCOPY_SP_W16_H4 16, 8 BLOCKCOPY_SP_W16_H4 16, 12 BLOCKCOPY_SP_W16_H4 16, 16 BLOCKCOPY_SP_W16_H4 16, 32 BLOCKCOPY_SP_W16_H4 16, 64 BLOCKCOPY_SP_W16_H4 16, 24 ;----------------------------------------------------------------------------- ; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SP_W16_H8_avx2 2 INIT_YMM avx2 cglobal blockcopy_sp_%1x%2, 4, 7, 4, dst, dstStride, src, srcStride mov r4d, %2/8 add r3, r3 lea r5, [3 * r3] lea r6, [3 * r1] .loop: movu m0, [r2] movu m1, [r2 + r3] movu m2, [r2 + 2 * r3] movu m3, [r2 + r5] packuswb m0, m1 packuswb m2, m3 vpermq m0, m0, 11011000b vpermq m2, m2, 11011000b vextracti128 xm1, m0, 1 vextracti128 xm3, m2, 1 movu [r0], xm0 movu [r0 + r1], xm1 movu [r0 + 2 * r1], xm2 movu [r0 + r6], xm3 lea r2, [r2 + 4 * r3] movu m0, [r2] movu m1, [r2 + r3] movu m2, [r2 + 2 * r3] movu m3, [r2 + r5] packuswb m0, m1 packuswb m2, m3 vpermq m0, m0, 11011000b vpermq m2, m2, 11011000b vextracti128 xm1, m0, 1 vextracti128 xm3, m2, 1 lea r0, [r0 + 4 * r1] movu [r0], xm0 movu [r0 + r1], xm1 movu [r0 + 2 * r1], xm2 movu [r0 + r6], xm3 lea r0, [r0 + 4 * r1] lea r2, [r2 + 4 * r3] dec r4d jnz .loop RET %endmacro BLOCKCOPY_SP_W16_H8_avx2 16, 16 BLOCKCOPY_SP_W16_H8_avx2 16, 32 ;----------------------------------------------------------------------------- ; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t 
srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SP_W24_H2 2 INIT_XMM sse2 cglobal blockcopy_sp_%1x%2, 4, 5, 6, dst, dstStride, src, srcStride mov r4d, %2/2 add r3, r3 .loop: movu m0, [r2] movu m1, [r2 + 16] movu m2, [r2 + 32] movu m3, [r2 + r3] movu m4, [r2 + r3 + 16] movu m5, [r2 + r3 + 32] packuswb m0, m1 packuswb m2, m3 packuswb m4, m5 movu [r0], m0 movlps [r0 + 16], m2 movhps [r0 + r1], m2 movu [r0 + r1 + 8], m4 lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] dec r4d jnz .loop RET %endmacro BLOCKCOPY_SP_W24_H2 24, 32 BLOCKCOPY_SP_W24_H2 24, 64 ;----------------------------------------------------------------------------- ; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SP_W32_H2 2 INIT_XMM sse2 cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride mov r4d, %2/2 add r3, r3 .loop: movu m0, [r2] movu m1, [r2 + 16] movu m2, [r2 + 32] movu m3, [r2 + 48] movu m4, [r2 + r3] movu m5, [r2 + r3 + 16] movu m6, [r2 + r3 + 32] movu m7, [r2 + r3 + 48] packuswb m0, m1 packuswb m2, m3 packuswb m4, m5 packuswb m6, m7 movu [r0], m0 movu [r0 + 16], m2 movu [r0 + r1], m4 movu [r0 + r1 + 16], m6 lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] dec r4d jnz .loop RET %endmacro BLOCKCOPY_SP_W32_H2 32, 8 BLOCKCOPY_SP_W32_H2 32, 16 BLOCKCOPY_SP_W32_H2 32, 24 BLOCKCOPY_SP_W32_H2 32, 32 BLOCKCOPY_SP_W32_H2 32, 64 BLOCKCOPY_SP_W32_H2 32, 48 ;----------------------------------------------------------------------------- ; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SP_W32_H4_avx2 2 INIT_YMM avx2 cglobal blockcopy_sp_%1x%2, 4, 7, 4, dst, dstStride, src, srcStride mov r4d, %2/4 add r3, r3 lea r5, [3 * r3] lea r6, [3 * r1] .loop: movu m0, [r2] movu m1, [r2 + 32] movu m2, [r2 + r3] movu m3, [r2 + r3 + 32] packuswb m0, m1 packuswb m2, m3 vpermq m0, m0, 11011000b vpermq m2, m2, 11011000b movu [r0], m0 movu [r0 + r1], m2 movu m0, [r2 + 2 * r3] movu m1, [r2 + 2 * r3 + 32] movu m2, [r2 + r5] movu m3, [r2 + r5 + 32] packuswb m0, m1 packuswb m2, m3 vpermq m0, m0, 11011000b vpermq m2, m2, 11011000b movu [r0 + 2 * r1], m0 movu [r0 + r6], m2 lea r0, [r0 + 4 * r1] lea r2, [r2 + 4 * r3] dec r4d jnz .loop RET %endmacro BLOCKCOPY_SP_W32_H4_avx2 32, 32 BLOCKCOPY_SP_W32_H4_avx2 32, 64 ;----------------------------------------------------------------------------- ; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SP_W48_H2 2 INIT_XMM sse2 cglobal blockcopy_sp_%1x%2, 4, 5, 6, dst, dstStride, src, srcStride mov r4d, %2 add r3, r3 .loop: movu m0, [r2] movu m1, [r2 + 16] movu m2, [r2 + 32] movu m3, [r2 + 48] movu m4, [r2 + 64] movu m5, [r2 + 80] packuswb m0, m1 packuswb m2, m3 packuswb m4, m5 movu [r0], m0 movu [r0 + 16], m2 movu [r0 + 32], m4 lea r0, [r0 + r1] lea r2, [r2 + r3] dec r4d jnz .loop RET %endmacro BLOCKCOPY_SP_W48_H2 48, 64 ;----------------------------------------------------------------------------- ; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SP_W64_H1 2 INIT_XMM sse2 cglobal 
blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride mov r4d, %2 add r3, r3 .loop: movu m0, [r2] movu m1, [r2 + 16] movu m2, [r2 + 32] movu m3, [r2 + 48] movu m4, [r2 + 64] movu m5, [r2 + 80] movu m6, [r2 + 96] movu m7, [r2 + 112] packuswb m0, m1 packuswb m2, m3 packuswb m4, m5 packuswb m6, m7 movu [r0], m0 movu [r0 + 16], m2 movu [r0 + 32], m4 movu [r0 + 48], m6 lea r0, [r0 + r1] lea r2, [r2 + r3] dec r4d jnz .loop RET %endmacro BLOCKCOPY_SP_W64_H1 64, 16 BLOCKCOPY_SP_W64_H1 64, 32 BLOCKCOPY_SP_W64_H1 64, 48 BLOCKCOPY_SP_W64_H1 64, 64 ;----------------------------------------------------------------------------- ; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_SP_W64_H4_avx2 2 INIT_YMM avx2 cglobal blockcopy_sp_%1x%2, 4, 7, 4, dst, dstStride, src, srcStride mov r4d, %2/4 add r3, r3 lea r5, [3 * r3] lea r6, [3 * r1] .loop: movu m0, [r2] movu m1, [r2 + 32] movu m2, [r2 + 64] movu m3, [r2 + 96] packuswb m0, m1 packuswb m2, m3 vpermq m0, m0, 11011000b vpermq m2, m2, 11011000b movu [r0], m0 movu [r0 + 32], m2 movu m0, [r2 + r3] movu m1, [r2 + r3 + 32] movu m2, [r2 + r3 + 64] movu m3, [r2 + r3 + 96] packuswb m0, m1 packuswb m2, m3 vpermq m0, m0, 11011000b vpermq m2, m2, 11011000b movu [r0 + r1], m0 movu [r0 + r1 + 32], m2 movu m0, [r2 + 2 * r3] movu m1, [r2 + 2 * r3 + 32] movu m2, [r2 + 2 * r3 + 64] movu m3, [r2 + 2 * r3 + 96] packuswb m0, m1 packuswb m2, m3 vpermq m0, m0, 11011000b vpermq m2, m2, 11011000b movu [r0 + 2 * r1], m0 movu [r0 + 2 * r1 + 32], m2 movu m0, [r2 + r5] movu m1, [r2 + r5 + 32] movu m2, [r2 + r5 + 64] movu m3, [r2 + r5 + 96] packuswb m0, m1 packuswb m2, m3 vpermq m0, m0, 11011000b vpermq m2, m2, 11011000b movu [r0 + r6], m0 movu [r0 + r6 + 32], m2 lea r0, [r0 + 4 * r1] lea r2, [r2 + 4 * r3] dec r4d jnz .loop RET %endmacro BLOCKCOPY_SP_W64_H4_avx2 64, 64 ;----------------------------------------------------------------------------- ; void blockfill_s_4x4(int16_t* dst, intptr_t dstride, int16_t val) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockfill_s_4x4, 3, 3, 1, dst, dstStride, val add r1, r1 movd m0, r2d pshuflw m0, m0, 0 movh [r0], m0 movh [r0 + r1], m0 movh [r0 + 2 * r1], m0 lea r0, [r0 + 2 * r1] movh [r0 + r1], m0 RET ;----------------------------------------------------------------------------- ; void blockfill_s_8x8(int16_t* dst, intptr_t dstride, int16_t val) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockfill_s_8x8, 3, 4, 1, dst, dstStride, val add r1, r1 lea r3, [3 * r1] movd m0, r2d pshuflw m0, m0, 0 pshufd m0, m0, 0 movu [r0], m0 movu [r0 + r1], m0 movu [r0 + 2 * r1], m0 movu [r0 + r3], m0 lea r0, [r0 + 4 * r1] movu [r0], m0 movu [r0 + r1], m0 movu [r0 + 2 * r1], m0 movu [r0 + r3], m0 RET ;----------------------------------------------------------------------------- ; void blockfill_s_16x16(int16_t* dst, intptr_t dstride, int16_t val) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockfill_s_16x16, 3, 4, 1, dst, dstStride, val add r1, r1 lea r3, [3 * r1] movd m0, r2d pshuflw m0, m0, 0 pshufd m0, m0, 0 movu [r0], m0 movu [r0 + 16], m0 movu [r0 + r1], m0 movu [r0 + r1 + 16], m0 movu [r0 + 2 * r1], m0 movu [r0 + 2 * r1 + 16], m0 movu [r0 + r3], m0 movu [r0 + r3 + 16], m0 movu [r0 + 4 * r1], m0 movu [r0 + 4 * r1 + 16], m0 lea r0, [r0 + 4 * r1] 
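    ; subsequent rows reuse the fill value already broadcast across m0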
movu [r0 + r1], m0 movu [r0 + r1 + 16], m0 movu [r0 + 2 * r1], m0 movu [r0 + 2 * r1 + 16], m0 movu [r0 + r3], m0 movu [r0 + r3 + 16], m0 movu [r0 + 4 * r1], m0 movu [r0 + 4 * r1 + 16], m0 lea r0, [r0 + 4 * r1] movu [r0 + r1], m0 movu [r0 + r1 + 16], m0 movu [r0 + 2 * r1], m0 movu [r0 + 2 * r1 + 16], m0 movu [r0 + r3], m0 movu [r0 + r3 + 16], m0 movu [r0 + 4 * r1], m0 movu [r0 + 4 * r1 + 16], m0 lea r0, [r0 + 4 * r1] movu [r0 + r1], m0 movu [r0 + r1 + 16], m0 movu [r0 + 2 * r1], m0 movu [r0 + 2 * r1 + 16], m0 movu [r0 + r3], m0 movu [r0 + r3 + 16], m0 RET INIT_YMM avx2 cglobal blockfill_s_16x16, 3, 4, 1 add r1, r1 lea r3, [3 * r1] movd xm0, r2d vpbroadcastw m0, xm0 movu [r0], m0 movu [r0 + r1], m0 movu [r0 + 2 * r1], m0 movu [r0 + r3], m0 lea r0, [r0 + 4 * r1] movu [r0], m0 movu [r0 + r1], m0 movu [r0 + 2 * r1], m0 movu [r0 + r3], m0 lea r0, [r0 + 4 * r1] movu [r0], m0 movu [r0 + r1], m0 movu [r0 + 2 * r1], m0 movu [r0 + r3], m0 lea r0, [r0 + 4 * r1] movu [r0], m0 movu [r0 + r1], m0 movu [r0 + 2 * r1], m0 movu [r0 + r3], m0 RET ;----------------------------------------------------------------------------- ; void blockfill_s_%1x%2(int16_t* dst, intptr_t dstride, int16_t val) ;----------------------------------------------------------------------------- %macro BLOCKFILL_S_W32_H8 2 INIT_XMM sse2 cglobal blockfill_s_%1x%2, 3, 5, 1, dst, dstStride, val mov r3d, %2/8 add r1, r1 lea r4, [3 * r1] movd m0, r2d pshuflw m0, m0, 0 pshufd m0, m0, 0 .loop: movu [r0], m0 movu [r0 + 16], m0 movu [r0 + 32], m0 movu [r0 + 48], m0 movu [r0 + r1], m0 movu [r0 + r1 + 16], m0 movu [r0 + r1 + 32], m0 movu [r0 + r1 + 48], m0 movu [r0 + 2 * r1], m0 movu [r0 + 2 * r1 + 16], m0 movu [r0 + 2 * r1 + 32], m0 movu [r0 + 2 * r1 + 48], m0 movu [r0 + r4], m0 movu [r0 + r4 + 16], m0 movu [r0 + r4 + 32], m0 movu [r0 + r4 + 48], m0 movu [r0 + 4 * r1], m0 movu [r0 + 4 * r1 + 16], m0 movu [r0 + 4 * r1 + 32], m0 movu [r0 + 4 * r1 + 48], m0 lea r0, [r0 + 4 * r1] movu [r0 + r1], m0 movu [r0 + r1 + 16], m0 movu [r0 + r1 + 32], m0 movu [r0 + r1 + 48], m0 movu [r0 + 2 * r1], m0 movu [r0 + 2 * r1 + 16], m0 movu [r0 + 2 * r1 + 32], m0 movu [r0 + 2 * r1 + 48], m0 movu [r0 + r4], m0 movu [r0 + r4 + 16], m0 movu [r0 + r4 + 32], m0 movu [r0 + r4 + 48], m0 lea r0, [r0 + 4 * r1] dec r3d jnz .loop RET %endmacro BLOCKFILL_S_W32_H8 32, 32 INIT_YMM avx2 cglobal blockfill_s_32x32, 3, 4, 1 add r1, r1 lea r3, [3 * r1] movd xm0, r2d vpbroadcastw m0, xm0 movu [r0], m0 movu [r0 + 32], m0 movu [r0 + r1], m0 movu [r0 + r1 + 32], m0 movu [r0 + 2 * r1], m0 movu [r0 + 2 * r1 + 32], m0 movu [r0 + r3], m0 movu [r0 + r3 + 32], m0 lea r0, [r0 + 4 * r1] movu [r0], m0 movu [r0 + 32], m0 movu [r0 + r1], m0 movu [r0 + r1 + 32], m0 movu [r0 + 2 * r1], m0 movu [r0 + 2 * r1 + 32], m0 movu [r0 + r3], m0 movu [r0 + r3 + 32], m0 lea r0, [r0 + 4 * r1] movu [r0], m0 movu [r0 + 32], m0 movu [r0 + r1], m0 movu [r0 + r1 + 32], m0 movu [r0 + 2 * r1], m0 movu [r0 + 2 * r1 + 32], m0 movu [r0 + r3], m0 movu [r0 + r3 + 32], m0 lea r0, [r0 + 4 * r1] movu [r0], m0 movu [r0 + 32], m0 movu [r0 + r1], m0 movu [r0 + r1 + 32], m0 movu [r0 + 2 * r1], m0 movu [r0 + 2 * r1 + 32], m0 movu [r0 + r3], m0 movu [r0 + r3 + 32], m0 lea r0, [r0 + 4 * r1] movu [r0], m0 movu [r0 + 32], m0 movu [r0 + r1], m0 movu [r0 + r1 + 32], m0 movu [r0 + 2 * r1], m0 movu [r0 + 2 * r1 + 32], m0 movu [r0 + r3], m0 movu [r0 + r3 + 32], m0 lea r0, [r0 + 4 * r1] movu [r0], m0 movu [r0 + 32], m0 movu [r0 + r1], m0 movu [r0 + r1 + 32], m0 movu [r0 + 2 * r1], m0 movu [r0 + 2 * r1 + 32], m0 movu [r0 + r3], m0 movu [r0 + 
r3 + 32], m0 lea r0, [r0 + 4 * r1] movu [r0], m0 movu [r0 + 32], m0 movu [r0 + r1], m0 movu [r0 + r1 + 32], m0 movu [r0 + 2 * r1], m0 movu [r0 + 2 * r1 + 32], m0 movu [r0 + r3], m0 movu [r0 + r3 + 32], m0 lea r0, [r0 + 4 * r1] movu [r0], m0 movu [r0 + 32], m0 movu [r0 + r1], m0 movu [r0 + r1 + 32], m0 movu [r0 + 2 * r1], m0 movu [r0 + 2 * r1 + 32], m0 movu [r0 + r3], m0 movu [r0 + r3 + 32], m0 RET ;----------------------------------------------------------------------------- ; void blockcopy_ps_2x4(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal blockcopy_ps_2x4, 4, 4, 1, dst, dstStride, src, srcStride add r1, r1 movd m0, [r2] pmovzxbw m0, m0 movd [r0], m0 movd m0, [r2 + r3] pmovzxbw m0, m0 movd [r0 + r1], m0 movd m0, [r2 + 2 * r3] pmovzxbw m0, m0 movd [r0 + 2 * r1], m0 lea r2, [r2 + 2 * r3] lea r0, [r0 + 2 * r1] movd m0, [r2 + r3] pmovzxbw m0, m0 movd [r0 + r1], m0 RET ;----------------------------------------------------------------------------- ; void blockcopy_ps_2x8(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal blockcopy_ps_2x8, 4, 4, 1, dst, dstStride, src, srcStride add r1, r1 movd m0, [r2] pmovzxbw m0, m0 movd [r0], m0 movd m0, [r2 + r3] pmovzxbw m0, m0 movd [r0 + r1], m0 movd m0, [r2 + 2 * r3] pmovzxbw m0, m0 movd [r0 + 2 * r1], m0 lea r2, [r2 + 2 * r3] lea r0, [r0 + 2 * r1] movd m0, [r2 + r3] pmovzxbw m0, m0 movd [r0 + r1], m0 movd m0, [r2 + 2 * r3] pmovzxbw m0, m0 movd [r0 + 2 * r1], m0 lea r2, [r2 + 2 * r3] lea r0, [r0 + 2 * r1] movd m0, [r2 + r3] pmovzxbw m0, m0 movd [r0 + r1], m0 movd m0, [r2 + 2 * r3] pmovzxbw m0, m0 movd [r0 + 2 * r1], m0 lea r2, [r2 + 2 * r3] lea r0, [r0 + 2 * r1] movd m0, [r2 + r3] pmovzxbw m0, m0 movd [r0 + r1], m0 RET ;----------------------------------------------------------------------------- ; void blockcopy_ps_2x16(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal blockcopy_ps_2x16, 4, 5, 2, dst, dstStride, src, srcStride add r1, r1 mov r4d, 16/2 .loop: movd m0, [r2] movd m1, [r2 + r3] dec r4d lea r2, [r2 + r3 * 2] pmovzxbw m0, m0 pmovzxbw m1, m1 movd [r0], m0 movd [r0 + r1], m1 lea r0, [r0 + r1 * 2] jnz .loop RET ;----------------------------------------------------------------------------- ; void blockcopy_ps_4x2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal blockcopy_ps_4x2, 4, 4, 1, dst, dstStride, src, srcStride add r1, r1 movd m0, [r2] pmovzxbw m0, m0 movh [r0], m0 movd m0, [r2 + r3] pmovzxbw m0, m0 movh [r0 + r1], m0 RET ;----------------------------------------------------------------------------- ; void blockcopy_ps_4x4(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal blockcopy_ps_4x4, 4, 4, 1, dst, dstStride, src, srcStride add r1, r1 movd m0, [r2] pmovzxbw m0, m0 movh [r0], m0 movd m0, [r2 + r3] pmovzxbw m0, m0 movh [r0 + r1], m0 movd m0, [r2 + 2 * r3] pmovzxbw m0, m0 movh [r0 + 2 * r1], m0 lea r2, [r2 + 2 * r3] lea r0, [r0 + 2 * r1] movd m0, [r2 + r3] pmovzxbw m0, m0 movh [r0 + r1], m0 RET 
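; NOTE: these blockcopy_ps kernels widen 8-bit pixels to int16_t coefficients:
; pmovzxbw zero-extends each byte to a word, and the destination stride is
; doubled up front (add r1, r1) because dstStride is given in int16_t units.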
;----------------------------------------------------------------------------- ; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); ;----------------------------------------------------------------------------- %macro BLOCKCOPY_PS_W4_H4 2 INIT_XMM sse4 cglobal blockcopy_ps_%1x%2, 4, 5, 1, dst, dstStride, src, srcStride add r1, r1 mov r4d, %2/4 .loop: movd m0, [r2] pmovzxbw m0, m0 movh [r0], m0 movd m0, [r2 + r3] pmovzxbw m0, m0 movh [r0 + r1], m0 movd m0, [r2 + 2 * r3] pmovzxbw m0, m0 movh [r0 + 2 * r1], m0 lea r2, [r2 + 2 * r3] lea r0, [r0 + 2 * r1] movd m0, [r2 + r3] pmovzxbw m0, m0 movh [r0 + r1], m0 lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] dec r4d jnz .loop RET %endmacro BLOCKCOPY_PS_W4_H4 4, 8 BLOCKCOPY_PS_W4_H4 4, 16 BLOCKCOPY_PS_W4_H4 4, 32 ;----------------------------------------------------------------------------- ; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); ;----------------------------------------------------------------------------- %macro BLOCKCOPY_PS_W6_H4 2 INIT_XMM sse4 cglobal blockcopy_ps_%1x%2, 4, 5, 1, dst, dstStride, src, srcStride add r1, r1 mov r4d, %2/4 .loop: movh m0, [r2] pmovzxbw m0, m0 movh [r0], m0 pextrd [r0 + 8], m0, 2 movh m0, [r2 + r3] pmovzxbw m0, m0 movh [r0 + r1], m0 pextrd [r0 + r1 + 8], m0, 2 movh m0, [r2 + 2 * r3] pmovzxbw m0, m0 movh [r0 + 2 * r1], m0 pextrd [r0 + 2 * r1 + 8], m0, 2 lea r2, [r2 + 2 * r3] lea r0, [r0 + 2 * r1] movh m0, [r2 + r3] pmovzxbw m0, m0 movh [r0 + r1], m0 pextrd [r0 + r1 + 8], m0, 2 lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] dec r4d jnz .loop RET %endmacro BLOCKCOPY_PS_W6_H4 6, 8 BLOCKCOPY_PS_W6_H4 6, 16 ;----------------------------------------------------------------------------- ; void blockcopy_ps_8x2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal blockcopy_ps_8x2, 4, 4, 1, dst, dstStride, src, srcStride add r1, r1 movh m0, [r2] pmovzxbw m0, m0 movu [r0], m0 movh m0, [r2 + r3] pmovzxbw m0, m0 movu [r0 + r1], m0 RET ;----------------------------------------------------------------------------- ; void blockcopy_ps_8x4(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal blockcopy_ps_8x4, 4, 4, 1, dst, dstStride, src, srcStride add r1, r1 movh m0, [r2] pmovzxbw m0, m0 movu [r0], m0 movh m0, [r2 + r3] pmovzxbw m0, m0 movu [r0 + r1], m0 movh m0, [r2 + 2 * r3] pmovzxbw m0, m0 movu [r0 + 2 * r1], m0 lea r2, [r2 + 2 * r3] lea r0, [r0 + 2 * r1] movh m0, [r2 + r3] pmovzxbw m0, m0 movu [r0 + r1], m0 RET ;----------------------------------------------------------------------------- ; void blockcopy_ps_8x6(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal blockcopy_ps_8x6, 4, 4, 1, dst, dstStride, src, srcStride add r1, r1 movh m0, [r2] pmovzxbw m0, m0 movu [r0], m0 movh m0, [r2 + r3] pmovzxbw m0, m0 movu [r0 + r1], m0 movh m0, [r2 + 2 * r3] pmovzxbw m0, m0 movu [r0 + 2 * r1], m0 lea r2, [r2 + 2 * r3] lea r0, [r0 + 2 * r1] movh m0, [r2 + r3] pmovzxbw m0, m0 movu [r0 + r1], m0 movh m0, [r2 + 2 * r3] pmovzxbw m0, m0 movu [r0 + 2 * r1], m0 lea r2, [r2 + 2 * r3] lea r0, [r0 + 2 * r1] movh m0, [r2 + r3] pmovzxbw m0, m0 movu [r0 + r1], m0 RET 
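; NOTE: for widths of 12 and above, the kernels below split each 16-byte
; source row: pmovzxbw widens the low eight bytes while punpckhbw against a
; zeroed register widens the high eight, yielding two 16-bit vectors per row.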
;-----------------------------------------------------------------------------
; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PS_W8_H4 2
INIT_XMM sse4
cglobal blockcopy_ps_%1x%2, 4, 5, 1, dst, dstStride, src, srcStride
    add r1, r1
    mov r4d, %2/4
.loop:
    movh m0, [r2]
    pmovzxbw m0, m0
    movu [r0], m0
    movh m0, [r2 + r3]
    pmovzxbw m0, m0
    movu [r0 + r1], m0
    movh m0, [r2 + 2 * r3]
    pmovzxbw m0, m0
    movu [r0 + 2 * r1], m0
    lea r2, [r2 + 2 * r3]
    lea r0, [r0 + 2 * r1]
    movh m0, [r2 + r3]
    pmovzxbw m0, m0
    movu [r0 + r1], m0
    lea r0, [r0 + 2 * r1]
    lea r2, [r2 + 2 * r3]
    dec r4d
    jnz .loop
    RET
%endmacro

BLOCKCOPY_PS_W8_H4 8, 8
BLOCKCOPY_PS_W8_H4 8, 16
BLOCKCOPY_PS_W8_H4 8, 32
BLOCKCOPY_PS_W8_H4 8, 12
BLOCKCOPY_PS_W8_H4 8, 64

;-----------------------------------------------------------------------------
; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PS_W12_H2 2
INIT_XMM sse4
cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride
    add r1, r1
    mov r4d, %2/2
    pxor m0, m0
.loop:
    movu m1, [r2]
    pmovzxbw m2, m1
    movu [r0], m2
    punpckhbw m1, m0
    movh [r0 + 16], m1
    movu m1, [r2 + r3]
    pmovzxbw m2, m1
    movu [r0 + r1], m2
    punpckhbw m1, m0
    movh [r0 + r1 + 16], m1
    lea r0, [r0 + 2 * r1]
    lea r2, [r2 + 2 * r3]
    dec r4d
    jnz .loop
    RET
%endmacro

BLOCKCOPY_PS_W12_H2 12, 16
BLOCKCOPY_PS_W12_H2 12, 32

;-----------------------------------------------------------------------------
; void blockcopy_ps_16x4(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal blockcopy_ps_16x4, 4, 4, 3, dst, dstStride, src, srcStride
    add r1, r1
    pxor m0, m0
    movu m1, [r2]
    pmovzxbw m2, m1
    movu [r0], m2
    punpckhbw m1, m0
    movu [r0 + 16], m1
    movu m1, [r2 + r3]
    pmovzxbw m2, m1
    movu [r0 + r1], m2
    punpckhbw m1, m0
    movu [r0 + r1 + 16], m1
    movu m1, [r2 + 2 * r3]
    pmovzxbw m2, m1
    movu [r0 + 2 * r1], m2
    punpckhbw m1, m0
    movu [r0 + 2 * r1 + 16], m1
    lea r0, [r0 + 2 * r1]
    lea r2, [r2 + 2 * r3]
    movu m1, [r2 + r3]
    pmovzxbw m2, m1
    movu [r0 + r1], m2
    punpckhbw m1, m0
    movu [r0 + r1 + 16], m1
    RET

;-----------------------------------------------------------------------------
; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PS_W16_H4 2
INIT_XMM sse4
cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride
    add r1, r1
    mov r4d, %2/4
    pxor m0, m0
.loop:
    movu m1, [r2]
    pmovzxbw m2, m1
    movu [r0], m2
    punpckhbw m1, m0
    movu [r0 + 16], m1
    movu m1, [r2 + r3]
    pmovzxbw m2, m1
    movu [r0 + r1], m2
    punpckhbw m1, m0
    movu [r0 + r1 + 16], m1
    movu m1, [r2 + 2 * r3]
    pmovzxbw m2, m1
    movu [r0 + 2 * r1], m2
    punpckhbw m1, m0
    movu [r0 + 2 * r1 + 16], m1
    lea r0, [r0 + 2 * r1]
    lea r2, [r2 + 2 * r3]
    movu m1, [r2 + r3]
    pmovzxbw m2, m1
    movu [r0 + r1], m2
    punpckhbw m1, m0
    movu [r0 + r1 + 16], m1
    lea r0, [r0 + 2 * r1]
    lea r2, [r2 + 2 * r3]
    dec r4d
    jnz .loop
    RET
%endmacro

BLOCKCOPY_PS_W16_H4 16, 8
BLOCKCOPY_PS_W16_H4 16, 12
BLOCKCOPY_PS_W16_H4 16, 16
BLOCKCOPY_PS_W16_H4 16, 32
BLOCKCOPY_PS_W16_H4 16, 64
BLOCKCOPY_PS_W16_H4 16, 24
;-----------------------------------------------------------------------------
; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PS_W16_H4_avx2 2
INIT_YMM avx2
cglobal blockcopy_ps_%1x%2, 4, 7, 3
    add r1, r1
    mov r4d, %2/4
    lea r5, [3 * r3]
    lea r6, [3 * r1]
    pxor m0, m0
.loop:
    movu xm1, [r2]
    pmovzxbw m2, xm1
    movu [r0], m2
    movu xm1, [r2 + r3]
    pmovzxbw m2, xm1
    movu [r0 + r1], m2
    movu xm1, [r2 + 2 * r3]
    pmovzxbw m2, xm1
    movu [r0 + 2 * r1], m2
    movu xm1, [r2 + r5]
    pmovzxbw m2, xm1
    movu [r0 + r6], m2
    lea r0, [r0 + 4 * r1]
    lea r2, [r2 + 4 * r3]
    dec r4d
    jnz .loop
    RET
%endmacro

BLOCKCOPY_PS_W16_H4_avx2 16, 16
BLOCKCOPY_PS_W16_H4_avx2 16, 32

;-----------------------------------------------------------------------------
; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PS_W24_H2 2
INIT_XMM sse4
cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride
    add r1, r1
    mov r4d, %2/2
    pxor m0, m0
.loop:
    movu m1, [r2]
    pmovzxbw m2, m1
    movu [r0], m2
    punpckhbw m1, m0
    movu [r0 + 16], m1
    movh m1, [r2 + 16]
    pmovzxbw m1, m1
    movu [r0 + 32], m1
    movu m1, [r2 + r3]
    pmovzxbw m2, m1
    movu [r0 + r1], m2
    punpckhbw m1, m0
    movu [r0 + r1 + 16], m1
    movh m1, [r2 + r3 + 16]
    pmovzxbw m1, m1
    movu [r0 + r1 + 32], m1
    lea r0, [r0 + 2 * r1]
    lea r2, [r2 + 2 * r3]
    dec r4d
    jnz .loop
    RET
%endmacro

BLOCKCOPY_PS_W24_H2 24, 32
BLOCKCOPY_PS_W24_H2 24, 64

;-----------------------------------------------------------------------------
; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PS_W32_H2 2
INIT_XMM sse4
cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride
    add r1, r1
    mov r4d, %2/2
    pxor m0, m0
.loop:
    movu m1, [r2]
    pmovzxbw m2, m1
    movu [r0], m2
    punpckhbw m1, m0
    movu [r0 + 16], m1
    movu m1, [r2 + 16]
    pmovzxbw m2, m1
    movu [r0 + 32], m2
    punpckhbw m1, m0
    movu [r0 + 48], m1
    movu m1, [r2 + r3]
    pmovzxbw m2, m1
    movu [r0 + r1], m2
    punpckhbw m1, m0
    movu [r0 + r1 + 16], m1
    movu m1, [r2 + r3 + 16]
    pmovzxbw m2, m1
    movu [r0 + r1 + 32], m2
    punpckhbw m1, m0
    movu [r0 + r1 + 48], m1
    lea r0, [r0 + 2 * r1]
    lea r2, [r2 + 2 * r3]
    dec r4d
    jnz .loop
    RET
%endmacro

BLOCKCOPY_PS_W32_H2 32, 8
BLOCKCOPY_PS_W32_H2 32, 16
BLOCKCOPY_PS_W32_H2 32, 24
BLOCKCOPY_PS_W32_H2 32, 32
BLOCKCOPY_PS_W32_H2 32, 64
BLOCKCOPY_PS_W32_H2 32, 48

;-----------------------------------------------------------------------------
; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PS_W32_H4_avx2 2
INIT_YMM avx2
cglobal blockcopy_ps_%1x%2, 4, 7, 2
    add r1, r1
    mov r4d, %2/4
    lea r5, [3 * r3]
    lea r6, [3 * r1]
.loop:
    pmovzxbw m0, [r2 + 0]
    pmovzxbw m1, [r2 + 16]
    movu [r0 + 0], m0
    movu [r0 + 32], m1
    pmovzxbw m0, [r2 + r3 + 0]
    pmovzxbw m1, [r2 + r3 + 16]
    movu [r0 + r1 + 0], m0
    movu [r0 + r1 + 32], m1
    pmovzxbw m0, [r2 + r3 * 2 + 0]
    pmovzxbw m1, [r2 + r3 * 2 + 16]
    movu [r0 + r1 * 2 + 0], m0
    movu [r0 + r1 * 2 + 32], m1
    pmovzxbw m0, [r2 + r5 + 0]
    pmovzxbw m1, [r2 + r5 + 16]
    movu [r0 + r6 + 0], m0
    movu [r0 + r6 + 32], m1
    lea r0, [r0 + 4 * r1]
    lea r2, [r2 + 4 * r3]
    dec r4d
    jnz .loop
    RET
%endmacro

BLOCKCOPY_PS_W32_H4_avx2 32, 32
BLOCKCOPY_PS_W32_H4_avx2 32, 64
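;-----------------------------------------------------------------------------
; NOTE: the AVX2 variants load 16 source bytes into an xmm register and let
; pmovzxbw widen them into a full 32-byte ymm of int16_t, so one load/store
; pair covers a whole 16-pixel row. A rough intrinsics sketch (helper name
; illustrative only, not part of the build):
;
;   #include <immintrin.h>                                    /* AVX2 */
;   static inline void copy_row_ps_16(int16_t *dst, const uint8_t *src)
;   {
;       __m128i b = _mm_loadu_si128((const __m128i *)src);    /* movu xm1 */
;       __m256i w = _mm256_cvtepu8_epi16(b);                  /* pmovzxbw */
;       _mm256_storeu_si256((__m256i *)dst, w);               /* movu m2  */
;   }
;-----------------------------------------------------------------------------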
;-----------------------------------------------------------------------------
; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PS_W48_H2 2
INIT_XMM sse4
cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride
    add r1, r1
    mov r4d, %2/2
    pxor m0, m0
.loop:
    movu m1, [r2]
    pmovzxbw m2, m1
    movu [r0], m2
    punpckhbw m1, m0
    movu [r0 + 16], m1
    movu m1, [r2 + 16]
    pmovzxbw m2, m1
    movu [r0 + 32], m2
    punpckhbw m1, m0
    movu [r0 + 48], m1
    movu m1, [r2 + 32]
    pmovzxbw m2, m1
    movu [r0 + 64], m2
    punpckhbw m1, m0
    movu [r0 + 80], m1
    movu m1, [r2 + r3]
    pmovzxbw m2, m1
    movu [r0 + r1], m2
    punpckhbw m1, m0
    movu [r0 + r1 + 16], m1
    movu m1, [r2 + r3 + 16]
    pmovzxbw m2, m1
    movu [r0 + r1 + 32], m2
    punpckhbw m1, m0
    movu [r0 + r1 + 48], m1
    movu m1, [r2 + r3 + 32]
    pmovzxbw m2, m1
    movu [r0 + r1 + 64], m2
    punpckhbw m1, m0
    movu [r0 + r1 + 80], m1
    lea r0, [r0 + 2 * r1]
    lea r2, [r2 + 2 * r3]
    dec r4d
    jnz .loop
    RET
%endmacro

BLOCKCOPY_PS_W48_H2 48, 64

;-----------------------------------------------------------------------------
; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PS_W64_H2 2
INIT_XMM sse4
cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride
    add r1, r1
    mov r4d, %2/2
    pxor m0, m0
.loop:
    movu m1, [r2]
    pmovzxbw m2, m1
    movu [r0], m2
    punpckhbw m1, m0
    movu [r0 + 16], m1
    movu m1, [r2 + 16]
    pmovzxbw m2, m1
    movu [r0 + 32], m2
    punpckhbw m1, m0
    movu [r0 + 48], m1
    movu m1, [r2 + 32]
    pmovzxbw m2, m1
    movu [r0 + 64], m2
    punpckhbw m1, m0
    movu [r0 + 80], m1
    movu m1, [r2 + 48]
    pmovzxbw m2, m1
    movu [r0 + 96], m2
    punpckhbw m1, m0
    movu [r0 + 112], m1
    movu m1, [r2 + r3]
    pmovzxbw m2, m1
    movu [r0 + r1], m2
    punpckhbw m1, m0
    movu [r0 + r1 + 16], m1
    movu m1, [r2 + r3 + 16]
    pmovzxbw m2, m1
    movu [r0 + r1 + 32], m2
    punpckhbw m1, m0
    movu [r0 + r1 + 48], m1
    movu m1, [r2 + r3 + 32]
    pmovzxbw m2, m1
    movu [r0 + r1 + 64], m2
    punpckhbw m1, m0
    movu [r0 + r1 + 80], m1
    movu m1, [r2 + r3 + 48]
    pmovzxbw m2, m1
    movu [r0 + r1 + 96], m2
    punpckhbw m1, m0
    movu [r0 + r1 + 112], m1
    lea r0, [r0 + 2 * r1]
    lea r2, [r2 + 2 * r3]
    dec r4d
    jnz .loop
    RET
%endmacro

BLOCKCOPY_PS_W64_H2 64, 16
BLOCKCOPY_PS_W64_H2 64, 32
BLOCKCOPY_PS_W64_H2 64, 48
BLOCKCOPY_PS_W64_H2 64, 64

;-----------------------------------------------------------------------------
; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
INIT_YMM avx2
cglobal blockcopy_ps_64x64, 4, 7, 4
    add r1, r1
    mov r4d, 64/8
    lea r5, [3 * r3]
    lea r6, [3 * r1]
.loop:
%rep 2
    pmovzxbw m0, [r2 + 0]
    pmovzxbw m1, [r2 + 16]
    pmovzxbw m2, [r2 + 32]
    pmovzxbw m3, [r2 + 48]
    movu [r0 + 0], m0
    movu [r0 + 32], m1
    movu [r0 + 64], m2
    movu [r0 + 96], m3
    pmovzxbw m0, [r2 + r3 + 0]
    pmovzxbw m1, [r2 + r3 + 16]
    pmovzxbw m2, [r2 + r3 + 32]
    pmovzxbw m3, [r2 + r3 + 48]
    movu [r0 + r1 + 0], m0
    movu [r0 + r1 + 32], m1
    movu [r0 + r1 + 64], m2
    movu [r0 + r1 + 96], m3
    pmovzxbw m0, [r2 + r3 * 2 + 0]
    pmovzxbw m1, [r2 + r3 * 2 + 16]
    pmovzxbw m2, [r2 + r3 * 2 + 32]
    pmovzxbw m3, [r2 + r3 * 2 + 48]
    movu [r0 + r1 * 2 + 0], m0
    movu [r0 + r1 * 2 + 32], m1
    movu [r0 + r1 * 2 + 64], m2
    movu [r0 + r1 * 2 + 96], m3
    pmovzxbw m0, [r2 + r5 + 0]
    pmovzxbw m1, [r2 + r5 + 16]
    pmovzxbw m2, [r2 + r5 + 32]
    pmovzxbw m3, [r2 + r5 + 48]
    movu [r0 + r6 + 0], m0
    movu [r0 + r6 + 32], m1
    movu [r0 + r6 + 64], m2
    movu [r0 + r6 + 96], m3
    lea r0, [r0 + 4 * r1]
    lea r2, [r2 + 4 * r3]
%endrep
    dec r4d
    jnz .loop
    RET

;-----------------------------------------------------------------------------
; void blockcopy_ss_2x4(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_ss_2x4, 4, 6, 0
    add r1, r1
    add r3, r3
    mov r4d, [r2]
    mov r5d, [r2 + r3]
    mov [r0], r4d
    mov [r0 + r1], r5d
    lea r2, [r2 + r3 * 2]
    lea r0, [r0 + 2 * r1]
    mov r4d, [r2]
    mov r5d, [r2 + r3]
    mov [r0], r4d
    mov [r0 + r1], r5d
    RET

;-----------------------------------------------------------------------------
; void blockcopy_ss_2x8(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_ss_2x8, 4, 6, 0
    add r1, r1
    add r3, r3
    mov r4d, [r2]
    mov r5d, [r2 + r3]
    mov [r0], r4d
    mov [r0 + r1], r5d
    lea r2, [r2 + r3 * 2]
    lea r0, [r0 + 2 * r1]
    mov r4d, [r2]
    mov r5d, [r2 + r3]
    mov [r0], r4d
    mov [r0 + r1], r5d
    lea r2, [r2 + r3 * 2]
    lea r0, [r0 + 2 * r1]
    mov r4d, [r2]
    mov r5d, [r2 + r3]
    mov [r0], r4d
    mov [r0 + r1], r5d
    lea r2, [r2 + r3 * 2]
    lea r0, [r0 + 2 * r1]
    mov r4d, [r2]
    mov r5d, [r2 + r3]
    mov [r0], r4d
    mov [r0 + r1], r5d
    RET

;-----------------------------------------------------------------------------
; void blockcopy_ss_2x16(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_ss_2x16, 4, 7, 0
    add r1, r1
    add r3, r3
    mov r6d, 16/2
.loop:
    mov r4d, [r2]
    mov r5d, [r2 + r3]
    dec r6d
    lea r2, [r2 + r3 * 2]
    mov [r0], r4d
    mov [r0 + r1], r5d
    lea r0, [r0 + r1 * 2]
    jnz .loop
    RET

;-----------------------------------------------------------------------------
; void blockcopy_ss_4x2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_ss_4x2, 4, 4, 2
    add r1, r1
    add r3, r3
    movh m0, [r2]
    movh m1, [r2 + r3]
    movh [r0], m0
    movh [r0 + r1], m1
    RET

;-----------------------------------------------------------------------------
; void blockcopy_ss_4x4(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_ss_4x4, 4, 4, 4
    add r1, r1
    add r3, r3
    movh m0, [r2]
    movh m1, [r2 + r3]
    lea r2, [r2 + r3 * 2]
    movh m2, [r2]
    movh m3, [r2 + r3]
    movh [r0], m0
    movh [r0 + r1], m1
    lea r0, [r0 + 2 * r1]
    movh [r0], m2
    movh [r0 + r1], m3
    RET

;-----------------------------------------------------------------------------
; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SS_W4_H8 2
INIT_XMM sse2
cglobal blockcopy_ss_%1x%2, 4, 5, 4
    mov r4d, %2/8
    add r1, r1
    add r3, r3
.loop:
    movh m0, [r2]
    movh m1, [r2 + r3]
    lea r2, [r2 + r3 * 2]
    movh m2, [r2]
    movh m3, [r2 + r3]
    movh [r0], m0
    movh [r0 + r1], m1
    lea r0, [r0 + 2 * r1]
    movh [r0], m2
    movh [r0 + r1], m3
    lea r0, [r0 + 2 * r1]
    lea r2, [r2 + 2 * r3]
    movh m0, [r2]
    movh m1, [r2 + r3]
    lea r2, [r2 + r3 * 2]
    movh m2, [r2]
    movh m3, [r2 + r3]
    movh [r0], m0
    movh [r0 + r1], m1
    lea r0, [r0 + 2 * r1]
    movh [r0], m2
    movh [r0 + r1], m3
    lea r0, [r0 + 2 * r1]
    lea r2, [r2 + 2 * r3]
    dec r4d
    jnz .loop
    RET
%endmacro
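;-----------------------------------------------------------------------------
; NOTE: blockcopy_ss is a plain 16-bit to 16-bit block copy, which is why both
; strides are doubled (`add r1, r1` / `add r3, r3`) before byte addressing. A
; minimal scalar C sketch, with the name and bx/by parameters illustrative
; only, not part of the build:
;
;   #include <stdint.h>
;   #include <string.h>
;   static void blockcopy_ss_c(int16_t *dst, intptr_t dstStride,
;                              const int16_t *src, intptr_t srcStride,
;                              int bx, int by)
;   {
;       for (int y = 0; y < by; y++) {
;           memcpy(dst, src, sizeof(int16_t) * bx);
;           dst += dstStride;
;           src += srcStride;
;       }
;   }
;
; For the narrow 2xN cases above, a row is only 4 bytes, so plain 32-bit GPR
; moves (mov r4d/r5d) are cheaper than any vector load.
;-----------------------------------------------------------------------------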
BLOCKCOPY_SS_W4_H8 4, 8
BLOCKCOPY_SS_W4_H8 4, 16
BLOCKCOPY_SS_W4_H8 4, 32

;-----------------------------------------------------------------------------
; void blockcopy_ss_6x8(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_ss_6x8, 4, 4, 4
    add r1, r1
    add r3, r3
    movu m0, [r2]
    movu m1, [r2 + r3]
    pshufd m2, m0, 2
    pshufd m3, m1, 2
    movh [r0], m0
    movd [r0 + 8], m2
    movh [r0 + r1], m1
    movd [r0 + r1 + 8], m3
    lea r0, [r0 + 2 * r1]
    lea r2, [r2 + 2 * r3]
    movu m0, [r2]
    movu m1, [r2 + r3]
    pshufd m2, m0, 2
    pshufd m3, m1, 2
    movh [r0], m0
    movd [r0 + 8], m2
    movh [r0 + r1], m1
    movd [r0 + r1 + 8], m3
    lea r0, [r0 + 2 * r1]
    lea r2, [r2 + 2 * r3]
    movu m0, [r2]
    movu m1, [r2 + r3]
    pshufd m2, m0, 2
    pshufd m3, m1, 2
    movh [r0], m0
    movd [r0 + 8], m2
    movh [r0 + r1], m1
    movd [r0 + r1 + 8], m3
    lea r0, [r0 + 2 * r1]
    lea r2, [r2 + 2 * r3]
    movu m0, [r2]
    movu m1, [r2 + r3]
    pshufd m2, m0, 2
    pshufd m3, m1, 2
    movh [r0], m0
    movd [r0 + 8], m2
    movh [r0 + r1], m1
    movd [r0 + r1 + 8], m3
    RET

;-----------------------------------------------------------------------------
; void blockcopy_ss_6x16(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_ss_6x16, 4, 5, 4
    add r1, r1
    add r3, r3
    mov r4d, 16/2
.loop:
    movh m0, [r2]
    movd m2, [r2 + 8]
    movh m1, [r2 + r3]
    movd m3, [r2 + r3 + 8]
    dec r4d
    lea r2, [r2 + r3 * 2]
    movh [r0], m0
    movd [r0 + 8], m2
    movh [r0 + r1], m1
    movd [r0 + r1 + 8], m3
    lea r0, [r0 + r1 * 2]
    jnz .loop
    RET

;-----------------------------------------------------------------------------
; void blockcopy_ss_8x2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_ss_8x2, 4, 4, 2
    add r1, r1
    add r3, r3
    movu m0, [r2]
    movu m1, [r2 + r3]
    movu [r0], m0
    movu [r0 + r1], m1
    RET

;-----------------------------------------------------------------------------
; void blockcopy_ss_8x4(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_ss_8x4, 4, 4, 4
    add r1, r1
    add r3, r3
    movu m0, [r2]
    movu m1, [r2 + r3]
    lea r2, [r2 + r3 * 2]
    movu m2, [r2]
    movu m3, [r2 + r3]
    movu [r0], m0
    movu [r0 + r1], m1
    lea r0, [r0 + 2 * r1]
    movu [r0], m2
    movu [r0 + r1], m3
    RET

;-----------------------------------------------------------------------------
; void blockcopy_ss_8x6(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_ss_8x6, 4, 4, 4
    add r1, r1
    add r3, r3
    movu m0, [r2]
    movu m1, [r2 + r3]
    lea r2, [r2 + r3 * 2]
    movu m2, [r2]
    movu m3, [r2 + r3]
    movu [r0], m0
    movu [r0 + r1], m1
    lea r0, [r0 + 2 * r1]
    movu [r0], m2
    movu [r0 + r1], m3
    lea r2, [r2 + r3 * 2]
    lea r0, [r0 + 2 * r1]
    movu m0, [r2]
    movu m1, [r2 + r3]
    movu [r0], m0
    movu [r0 + r1], m1
    RET

;-----------------------------------------------------------------------------
; void blockcopy_ss_8x12(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_ss_8x12, 4, 5, 2
    add r1, r1
    add r3, r3
    mov r4d, 12/2
.loop:
    movu m0, [r2]
    movu m1, [r2 + r3]
    lea r2, [r2 + 2 * r3]
    dec r4d
    movu [r0], m0
    movu [r0 + r1], m1
    lea r0, [r0 + 2 * r1]
    jnz .loop
    RET

;-----------------------------------------------------------------------------
; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SS_W8_H8 2
INIT_XMM sse2
cglobal blockcopy_ss_%1x%2, 4, 5, 4
    mov r4d, %2/8
    add r1, r1
    add r3, r3
.loop:
    movu m0, [r2]
    movu m1, [r2 + r3]
    lea r2, [r2 + r3 * 2]
    movu m2, [r2]
    movu m3, [r2 + r3]
    movu [r0], m0
    movu [r0 + r1], m1
    lea r0, [r0 + 2 * r1]
    movu [r0], m2
    movu [r0 + r1], m3
    lea r2, [r2 + 2 * r3]
    lea r0, [r0 + 2 * r1]
    movu m0, [r2]
    movu m1, [r2 + r3]
    lea r2, [r2 + r3 * 2]
    movu m2, [r2]
    movu m3, [r2 + r3]
    movu [r0], m0
    movu [r0 + r1], m1
    lea r0, [r0 + 2 * r1]
    movu [r0], m2
    movu [r0 + r1], m3
    dec r4d
    lea r0, [r0 + 2 * r1]
    lea r2, [r2 + 2 * r3]
    jnz .loop
    RET
%endmacro

BLOCKCOPY_SS_W8_H8 8, 8
BLOCKCOPY_SS_W8_H8 8, 16
BLOCKCOPY_SS_W8_H8 8, 32
BLOCKCOPY_SS_W8_H8 8, 64

;-----------------------------------------------------------------------------
; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SS_W12_H4 2
INIT_XMM sse2
cglobal blockcopy_ss_%1x%2, 4, 5, 4
    mov r4d, %2/4
    add r1, r1
    add r3, r3
.loop:
    movu m0, [r2]
    movh m1, [r2 + 16]
    movu m2, [r2 + r3]
    movh m3, [r2 + r3 + 16]
    lea r2, [r2 + 2 * r3]
    movu [r0], m0
    movh [r0 + 16], m1
    movu [r0 + r1], m2
    movh [r0 + r1 + 16], m3
    lea r0, [r0 + 2 * r1]
    movu m0, [r2]
    movh m1, [r2 + 16]
    movu m2, [r2 + r3]
    movh m3, [r2 + r3 + 16]
    movu [r0], m0
    movh [r0 + 16], m1
    movu [r0 + r1], m2
    movh [r0 + r1 + 16], m3
    dec r4d
    lea r0, [r0 + 2 * r1]
    lea r2, [r2 + 2 * r3]
    jnz .loop
    RET
%endmacro

BLOCKCOPY_SS_W12_H4 12, 16
BLOCKCOPY_SS_W12_H4 12, 32

;-----------------------------------------------------------------------------
; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SS_W16_H4 2
INIT_XMM sse2
cglobal blockcopy_ss_%1x%2, 4, 5, 4
    mov r4d, %2/4
    add r1, r1
    add r3, r3
.loop:
    movu m0, [r2]
    movu m1, [r2 + 16]
    movu m2, [r2 + r3]
    movu m3, [r2 + r3 + 16]
    movu [r0], m0
    movu [r0 + 16], m1
    movu [r0 + r1], m2
    movu [r0 + r1 + 16], m3
    lea r2, [r2 + 2 * r3]
    lea r0, [r0 + 2 * r1]
    movu m0, [r2]
    movu m1, [r2 + 16]
    movu m2, [r2 + r3]
    movu m3, [r2 + r3 + 16]
    movu [r0], m0
    movu [r0 + 16], m1
    movu [r0 + r1], m2
    movu [r0 + r1 + 16], m3
    dec r4d
    lea r0, [r0 + 2 * r1]
    lea r2, [r2 + 2 * r3]
    jnz .loop
    RET
%endmacro

BLOCKCOPY_SS_W16_H4 16, 4
BLOCKCOPY_SS_W16_H4 16, 12

;-----------------------------------------------------------------------------
; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SS_W16_H4_avx 2
INIT_YMM avx
cglobal blockcopy_ss_%1x%2, 4, 7, 4
    mov r4d, %2/4
    add r1, r1
    add r3, r3
    lea r5, [3 * r3]
    lea r6, [3 * r1]
.loop:
    movu m0, [r2]
    movu m1, [r2 + r3]
    movu m2, [r2 + 2 * r3]
    movu m3, [r2 + r5]
    movu [r0], m0
    movu [r0 + r1], m1
    movu [r0 + 2 * r1], m2
    movu [r0 + r6], m3
    lea r0, [r0 + 4 * r1]
    lea r2, [r2 + 4 * r3]
    dec r4d
    jnz .loop
    RET
%endmacro

BLOCKCOPY_SS_W16_H4_avx 16, 4
BLOCKCOPY_SS_W16_H4_avx 16, 12
BLOCKCOPY_SS_W16_H4_avx 16, 8
BLOCKCOPY_SS_W16_H4_avx 16, 16
BLOCKCOPY_SS_W16_H4_avx 16, 24
BLOCKCOPY_SS_W16_H4_avx 16, 32
BLOCKCOPY_SS_W16_H4_avx 16, 64

;-----------------------------------------------------------------------------
; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SS_W16_H8 2
INIT_XMM sse2
cglobal blockcopy_ss_%1x%2, 4, 5, 4
    mov r4d, %2/8
    add r1, r1
    add r3, r3
.loop:
    movu m0, [r2]
    movu m1, [r2 + 16]
    movu m2, [r2 + r3]
    movu m3, [r2 + r3 + 16]
    movu [r0], m0
    movu [r0 + 16], m1
    movu [r0 + r1], m2
    movu [r0 + r1 + 16], m3
    lea r2, [r2 + 2 * r3]
    lea r0, [r0 + 2 * r1]
    movu m0, [r2]
    movu m1, [r2 + 16]
    movu m2, [r2 + r3]
    movu m3, [r2 + r3 + 16]
    movu [r0], m0
    movu [r0 + 16], m1
    movu [r0 + r1], m2
    movu [r0 + r1 + 16], m3
    lea r2, [r2 + 2 * r3]
    lea r0, [r0 + 2 * r1]
    movu m0, [r2]
    movu m1, [r2 + 16]
    movu m2, [r2 + r3]
    movu m3, [r2 + r3 + 16]
    movu [r0], m0
    movu [r0 + 16], m1
    movu [r0 + r1], m2
    movu [r0 + r1 + 16], m3
    lea r2, [r2 + 2 * r3]
    lea r0, [r0 + 2 * r1]
    movu m0, [r2]
    movu m1, [r2 + 16]
    movu m2, [r2 + r3]
    movu m3, [r2 + r3 + 16]
    movu [r0], m0
    movu [r0 + 16], m1
    movu [r0 + r1], m2
    movu [r0 + r1 + 16], m3
    dec r4d
    lea r2, [r2 + 2 * r3]
    lea r0, [r0 + 2 * r1]
    jnz .loop
    RET
%endmacro

BLOCKCOPY_SS_W16_H8 16, 8
BLOCKCOPY_SS_W16_H8 16, 16
BLOCKCOPY_SS_W16_H8 16, 32
BLOCKCOPY_SS_W16_H8 16, 64
BLOCKCOPY_SS_W16_H8 16, 24

;-----------------------------------------------------------------------------
; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SS_W24_H4 2
INIT_XMM sse2
cglobal blockcopy_ss_%1x%2, 4, 5, 6
    mov r4d, %2/4
    add r1, r1
    add r3, r3
.loop:
    movu m0, [r2]
    movu m1, [r2 + 16]
    movu m2, [r2 + 32]
    movu m3, [r2 + r3]
    movu m4, [r2 + r3 + 16]
    movu m5, [r2 + r3 + 32]
    movu [r0], m0
    movu [r0 + 16], m1
    movu [r0 + 32], m2
    movu [r0 + r1], m3
    movu [r0 + r1 + 16], m4
    movu [r0 + r1 + 32], m5
    lea r2, [r2 + 2 * r3]
    lea r0, [r0 + 2 * r1]
    movu m0, [r2]
    movu m1, [r2 + 16]
    movu m2, [r2 + 32]
    movu m3, [r2 + r3]
    movu m4, [r2 + r3 + 16]
    movu m5, [r2 + r3 + 32]
    movu [r0], m0
    movu [r0 + 16], m1
    movu [r0 + 32], m2
    movu [r0 + r1], m3
    movu [r0 + r1 + 16], m4
    movu [r0 + r1 + 32], m5
    dec r4d
    lea r2, [r2 + 2 * r3]
    lea r0, [r0 + 2 * r1]
    jnz .loop
    RET
%endmacro

BLOCKCOPY_SS_W24_H4 24, 32
BLOCKCOPY_SS_W24_H4 24, 64

;-----------------------------------------------------------------------------
; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SS_W24_H4_avx 2
INIT_YMM avx
cglobal blockcopy_ss_%1x%2, 4, 7, 2
    mov r4d, %2/4
    add r1, r1
    add r3, r3
    lea r5, [3 * r3]
    lea r6, [3 * r1]
.loop:
    movu m0, [r2]
    movu xm1, [r2 + 32]
    movu [r0], m0
    movu [r0 + 32], xm1
    movu m0, [r2 + r3]
    movu xm1, [r2 + r3 + 32]
    movu [r0 + r1], m0
    movu [r0 + r1 + 32], xm1
    movu m0, [r2 + 2 * r3]
    movu xm1, [r2 + 2 * r3 + 32]
    movu [r0 + 2 * r1], m0
    movu [r0 + 2 * r1 + 32], xm1
    movu m0, [r2 + r5]
    movu xm1, [r2 + r5 + 32]
    movu [r0 + r6], m0
    movu [r0 + r6 + 32], xm1
    dec r4d
    lea r2, [r2 + 4 * r3]
    lea r0, [r0 + 4 * r1]
    jnz .loop
    RET
%endmacro

BLOCKCOPY_SS_W24_H4_avx 24, 32
BLOCKCOPY_SS_W24_H4_avx 24, 64
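;-----------------------------------------------------------------------------
; NOTE: each %1x%2 instantiation fixes only the block size, so callers usually
; reach these kernels through a function pointer table indexed by partition
; size. A hedged C sketch of that wiring; every name below is hypothetical and
; not the actual davs2 primitive table:
;
;   #include <stdint.h>
;   typedef void (*blockcopy_ss_t)(int16_t *dst, intptr_t dstStride,
;                                  const int16_t *src, intptr_t srcStride);
;   /* hypothetical symbol for the 16x16 AVX instantiation above */
;   void davs2_blockcopy_ss_16x16_avx(int16_t *, intptr_t,
;                                     const int16_t *, intptr_t);
;   enum { PART_16x16, NUM_PART_SIZES };       /* illustrative indices only */
;   static blockcopy_ss_t copy_ss[NUM_PART_SIZES];
;   static void init_copy_ss(int cpu_has_avx)
;   {
;       if (cpu_has_avx)
;           copy_ss[PART_16x16] = davs2_blockcopy_ss_16x16_avx;
;       /* ... one slot per block size, with SSE2 fallbacks otherwise ... */
;   }
;-----------------------------------------------------------------------------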
;-----------------------------------------------------------------------------
; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SS_W32_H4 2
INIT_XMM sse2
cglobal blockcopy_ss_%1x%2, 4, 5, 4
    mov r4d, %2/4
    add r1, r1
    add r3, r3
.loop:
    movu m0, [r2]
    movu m1, [r2 + 16]
    movu m2, [r2 + 32]
    movu m3, [r2 + 48]
    movu [r0], m0
    movu [r0 + 16], m1
    movu [r0 + 32], m2
    movu [r0 + 48], m3
    movu m0, [r2 + r3]
    movu m1, [r2 + r3 + 16]
    movu m2, [r2 + r3 + 32]
    movu m3, [r2 + r3 + 48]
    movu [r0 + r1], m0
    movu [r0 + r1 + 16], m1
    movu [r0 + r1 + 32], m2
    movu [r0 + r1 + 48], m3
    lea r2, [r2 + 2 * r3]
    lea r0, [r0 + 2 * r1]
    movu m0, [r2]
    movu m1, [r2 + 16]
    movu m2, [r2 + 32]
    movu m3, [r2 + 48]
    movu [r0], m0
    movu [r0 + 16], m1
    movu [r0 + 32], m2
    movu [r0 + 48], m3
    movu m0, [r2 + r3]
    movu m1, [r2 + r3 + 16]
    movu m2, [r2 + r3 + 32]
    movu m3, [r2 + r3 + 48]
    movu [r0 + r1], m0
    movu [r0 + r1 + 16], m1
    movu [r0 + r1 + 32], m2
    movu [r0 + r1 + 48], m3
    dec r4d
    lea r2, [r2 + 2 * r3]
    lea r0, [r0 + 2 * r1]
    jnz .loop
    RET
%endmacro

BLOCKCOPY_SS_W32_H4 32, 8
BLOCKCOPY_SS_W32_H4 32, 16
BLOCKCOPY_SS_W32_H4 32, 24
BLOCKCOPY_SS_W32_H4 32, 32
BLOCKCOPY_SS_W32_H4 32, 64
BLOCKCOPY_SS_W32_H4 32, 48

;-----------------------------------------------------------------------------
; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SS_W32_H4_avx 2
INIT_YMM avx
cglobal blockcopy_ss_%1x%2, 4, 7, 4
    mov r4d, %2/4
    add r1, r1
    add r3, r3
    lea r5, [3 * r1]
    lea r6, [3 * r3]
.loop:
    movu m0, [r2]
    movu m1, [r2 + 32]
    movu [r0], m0
    movu [r0 + 32], m1
    movu m0, [r2 + r3]
    movu m1, [r2 + r3 + 32]
    movu [r0 + r1], m0
    movu [r0 + r1 + 32], m1
    movu m0, [r2 + 2 * r3]
    movu m1, [r2 + 2 * r3 + 32]
    movu [r0 + 2 * r1], m0
    movu [r0 + 2 * r1 + 32], m1
    movu m0, [r2 + r6]
    movu m1, [r2 + r6 + 32]
    movu [r0 + r5], m0
    movu [r0 + r5 + 32], m1
    dec r4d
    lea r2, [r2 + 4 * r3]
    lea r0, [r0 + 4 * r1]
    jnz .loop
    RET
%endmacro

BLOCKCOPY_SS_W32_H4_avx 32, 8
BLOCKCOPY_SS_W32_H4_avx 32, 16
BLOCKCOPY_SS_W32_H4_avx 32, 24
BLOCKCOPY_SS_W32_H4_avx 32, 32
BLOCKCOPY_SS_W32_H4_avx 32, 48
BLOCKCOPY_SS_W32_H4_avx 32, 64

;-----------------------------------------------------------------------------
; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SS_W48_H2 2
INIT_XMM sse2
cglobal blockcopy_ss_%1x%2, 4, 5, 6
    mov r4d, %2/4
    add r1, r1
    add r3, r3
.loop:
    movu m0, [r2]
    movu m1, [r2 + 16]
    movu m2, [r2 + 32]
    movu m3, [r2 + 48]
    movu m4, [r2 + 64]
    movu m5, [r2 + 80]
    movu [r0], m0
    movu [r0 + 16], m1
    movu [r0 + 32], m2
    movu [r0 + 48], m3
    movu [r0 + 64], m4
    movu [r0 + 80], m5
    movu m0, [r2 + r3]
    movu m1, [r2 + r3 + 16]
    movu m2, [r2 + r3 + 32]
    movu m3, [r2 + r3 + 48]
    movu m4, [r2 + r3 + 64]
    movu m5, [r2 + r3 + 80]
    movu [r0 + r1], m0
    movu [r0 + r1 + 16], m1
    movu [r0 + r1 + 32], m2
    movu [r0 + r1 + 48], m3
    movu [r0 + r1 + 64], m4
    movu [r0 + r1 + 80], m5
    lea r2, [r2 + 2 * r3]
    lea r0, [r0 + 2 * r1]
    movu m0, [r2]
    movu m1, [r2 + 16]
    movu m2, [r2 + 32]
    movu m3, [r2 + 48]
    movu m4, [r2 + 64]
    movu m5, [r2 + 80]
    movu [r0], m0
    movu [r0 + 16], m1
    movu [r0 + 32], m2
    movu [r0 + 48], m3
    movu [r0 + 64], m4
    movu [r0 + 80], m5
    movu m0, [r2 + r3]
    movu m1, [r2 + r3 + 16]
    movu m2, [r2 + r3 + 32]
    movu m3, [r2 + r3 + 48]
    movu m4, [r2 + r3 + 64]
    movu m5, [r2 + r3 + 80]
    movu [r0 + r1], m0
    movu [r0 + r1 + 16], m1
    movu [r0 + r1 + 32], m2
    movu [r0 + r1 + 48], m3
    movu [r0 + r1 + 64], m4
    movu [r0 + r1 + 80], m5
    dec r4d
    lea r2, [r2 + 2 * r3]
    lea r0, [r0 + 2 * r1]
    jnz .loop
    RET
%endmacro

BLOCKCOPY_SS_W48_H2 48, 64

;-----------------------------------------------------------------------------
; void blockcopy_ss_48x64(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_YMM avx
cglobal blockcopy_ss_48x64, 4, 7, 6
    mov r4d, 64/4
    add r1, r1
    add r3, r3
    lea r5, [3 * r3]
    lea r6, [3 * r1]
.loop:
    movu m0, [r2]
    movu m1, [r2 + 32]
    movu m2, [r2 + 64]
    movu [r0], m0
    movu [r0 + 32], m1
    movu [r0 + 64], m2
    movu m0, [r2 + r3]
    movu m1, [r2 + r3 + 32]
    movu m2, [r2 + r3 + 64]
    movu [r0 + r1], m0
    movu [r0 + r1 + 32], m1
    movu [r0 + r1 + 64], m2
    movu m0, [r2 + 2 * r3]
    movu m1, [r2 + 2 * r3 + 32]
    movu m2, [r2 + 2 * r3 + 64]
    movu [r0 + 2 * r1], m0
    movu [r0 + 2 * r1 + 32], m1
    movu [r0 + 2 * r1 + 64], m2
    movu m0, [r2 + r5]
    movu m1, [r2 + r5 + 32]
    movu m2, [r2 + r5 + 64]
    movu [r0 + r6], m0
    movu [r0 + r6 + 32], m1
    movu [r0 + r6 + 64], m2
    dec r4d
    lea r2, [r2 + 4 * r3]
    lea r0, [r0 + 4 * r1]
    jnz .loop
    RET

;-----------------------------------------------------------------------------
; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SS_W64_H4 2
INIT_XMM sse2
cglobal blockcopy_ss_%1x%2, 4, 5, 6, dst, dstStride, src, srcStride
    mov r4d, %2/4
    add r1, r1
    add r3, r3
.loop:
    movu m0, [r2]
    movu m1, [r2 + 16]
    movu m2, [r2 + 32]
    movu m3, [r2 + 48]
    movu [r0], m0
    movu [r0 + 16], m1
    movu [r0 + 32], m2
    movu [r0 + 48], m3
    movu m0, [r2 + 64]
    movu m1, [r2 + 80]
    movu m2, [r2 + 96]
    movu m3, [r2 + 112]
    movu [r0 + 64], m0
    movu [r0 + 80], m1
    movu [r0 + 96], m2
    movu [r0 + 112], m3
    movu m0, [r2 + r3]
    movu m1, [r2 + r3 + 16]
    movu m2, [r2 + r3 + 32]
    movu m3, [r2 + r3 + 48]
    movu [r0 + r1], m0
    movu [r0 + r1 + 16], m1
    movu [r0 + r1 + 32], m2
    movu [r0 + r1 + 48], m3
    movu m0, [r2 + r3 + 64]
    movu m1, [r2 + r3 + 80]
    movu m2, [r2 + r3 + 96]
    movu m3, [r2 + r3 + 112]
    movu [r0 + r1 + 64], m0
    movu [r0 + r1 + 80], m1
    movu [r0 + r1 + 96], m2
    movu [r0 + r1 + 112], m3
    lea r2, [r2 + 2 * r3]
    lea r0, [r0 + 2 * r1]
    movu m0, [r2]
    movu m1, [r2 + 16]
    movu m2, [r2 + 32]
    movu m3, [r2 + 48]
    movu [r0], m0
    movu [r0 + 16], m1
    movu [r0 + 32], m2
    movu [r0 + 48], m3
    movu m0, [r2 + 64]
    movu m1, [r2 + 80]
    movu m2, [r2 + 96]
    movu m3, [r2 + 112]
    movu [r0 + 64], m0
    movu [r0 + 80], m1
    movu [r0 + 96], m2
    movu [r0 + 112], m3
    movu m0, [r2 + r3]
    movu m1, [r2 + r3 + 16]
    movu m2, [r2 + r3 + 32]
    movu m3, [r2 + r3 + 48]
    movu [r0 + r1], m0
    movu [r0 + r1 + 16], m1
    movu [r0 + r1 + 32], m2
    movu [r0 + r1 + 48], m3
    movu m0, [r2 + r3 + 64]
    movu m1, [r2 + r3 + 80]
    movu m2, [r2 + r3 + 96]
    movu m3, [r2 + r3 + 112]
    movu [r0 + r1 + 64], m0
    movu [r0 + r1 + 80], m1
    movu [r0 + r1 + 96], m2
    movu [r0 + r1 + 112], m3
    dec r4d
    lea r2, [r2 + 2 * r3]
    lea r0, [r0 + 2 * r1]
    jnz .loop
    RET
%endmacro

BLOCKCOPY_SS_W64_H4 64, 16
BLOCKCOPY_SS_W64_H4 64, 32
BLOCKCOPY_SS_W64_H4 64, 48
BLOCKCOPY_SS_W64_H4 64, 64

;-----------------------------------------------------------------------------
; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SS_W64_H4_avx 2
INIT_YMM avx
cglobal blockcopy_ss_%1x%2, 4, 7, 4, dst, dstStride, src, srcStride
    mov r4d, %2/4
    add r1, r1
    add r3, r3
    lea r5, [3 * r1]
    lea r6, [3 * r3]
.loop:
    movu m0, [r2]
    movu m1, [r2 + 32]
    movu m2, [r2 + 64]
    movu m3, [r2 + 96]
    movu [r0], m0
    movu [r0 + 32], m1
    movu [r0 + 64], m2
    movu [r0 + 96], m3
    movu m0, [r2 + r3]
    movu m1, [r2 + r3 + 32]
    movu m2, [r2 + r3 + 64]
    movu m3, [r2 + r3 + 96]
    movu [r0 + r1], m0
    movu [r0 + r1 + 32], m1
    movu [r0 + r1 + 64], m2
    movu [r0 + r1 + 96], m3
    movu m0, [r2 + 2 * r3]
    movu m1, [r2 + 2 * r3 + 32]
    movu m2, [r2 + 2 * r3 + 64]
    movu m3, [r2 + 2 * r3 + 96]
    movu [r0 + 2 * r1], m0
    movu [r0 + 2 * r1 + 32], m1
    movu [r0 + 2 * r1 + 64], m2
    movu [r0 + 2 * r1 + 96], m3
    movu m0, [r2 + r6]
    movu m1, [r2 + r6 + 32]
    movu m2, [r2 + r6 + 64]
    movu m3, [r2 + r6 + 96]
    lea r2, [r2 + 4 * r3]
    movu [r0 + r5], m0
    movu [r0 + r5 + 32], m1
    movu [r0 + r5 + 64], m2
    movu [r0 + r5 + 96], m3
    lea r0, [r0 + 4 * r1]
    dec r4d
    jnz .loop
    RET
%endmacro

BLOCKCOPY_SS_W64_H4_avx 64, 16
BLOCKCOPY_SS_W64_H4_avx 64, 32
BLOCKCOPY_SS_W64_H4_avx 64, 48
BLOCKCOPY_SS_W64_H4_avx 64, 64

;--------------------------------------------------------------------------------------
; void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
;--------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal cpy2Dto1D_shr_4, 3, 4, 4
    add r2d, r2d
    movd m0, r3m
    pcmpeqw m1, m1
    psllw m1, m0
    psraw m1, 1

    ; register alloc
    ; r0 - dst
    ; r1 - src
    ; r2 - srcStride
    ; m0 - shift
    ; m1 - word [-round]

    ; Row 0-3
    movh m2, [r1]
    movhps m2, [r1 + r2]
    lea r1, [r1 + r2 * 2]
    movh m3, [r1]
    movhps m3, [r1 + r2]
    psubw m2, m1
    psubw m3, m1
    psraw m2, m0
    psraw m3, m0
    mova [r0 + 0 * mmsize], m2
    mova [r0 + 1 * mmsize], m3
    RET

;--------------------------------------------------------------------------------------
; void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
;--------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal cpy2Dto1D_shr_8, 3, 5, 4
    add r2d, r2d
    movd m0, r3m
    pcmpeqw m1, m1
    psllw m1, m0
    psraw m1, 1
    mov r3d, 8/4
    lea r4, [r2 * 3]

    ; register alloc
    ; r0 - dst
    ; r1 - src
    ; r2 - srcStride
    ; r3 - loop counter
    ; r4 - stride * 3
    ; m0 - shift
    ; m1 - word [-round]

.loop:
    ; Row 0-1
    mova m2, [r1]
    mova m3, [r1 + r2]
    psubw m2, m1
    psubw m3, m1
    psraw m2, m0
    psraw m3, m0
    mova [r0 + 0 * mmsize], m2
    mova [r0 + 1 * mmsize], m3

    ; Row 2-3
    mova m2, [r1 + r2 * 2]
    mova m3, [r1 + r4]
    psubw m2, m1
    psubw m3, m1
    psraw m2, m0
    psraw m3, m0
    mova [r0 + 2 * mmsize], m2
    mova [r0 + 3 * mmsize], m3

    add r0, 4 * mmsize
    lea r1, [r1 + r2 * 4]
    dec r3d
    jnz .loop
    RET

INIT_YMM avx2
cglobal cpy2Dto1D_shr_8, 3, 4, 4
    add r2d, r2d
    movd xm0, r3m
    pcmpeqw m1, m1
    psllw m1, xm0
    psraw m1, 1
    lea r3, [r2 * 3]

    ; Row 0-3
    movu xm2, [r1]
    vinserti128 m2, m2, [r1 + r2], 1
    movu xm3, [r1 + 2 * r2]
    vinserti128 m3, m3, [r1 + r3], 1
    psubw m2, m1
    psraw m2, xm0
    psubw m3, m1
    psraw m3, xm0
    movu [r0], m2
    movu [r0 + 32], m3

    ; Row 4-7
    lea r1, [r1 + 4 * r2]
    movu xm2, [r1]
    vinserti128 m2, m2, [r1 + r2], 1
    movu xm3, [r1 + 2 * r2]
    vinserti128 m3, m3, [r1 + r3], 1
    psubw m2, m1
    psraw m2, xm0
    psubw m3, m1
    psraw m3, xm0
    movu [r0 + 64], m2
    movu [r0 + 96], m3
    RET
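;--------------------------------------------------------------------------------------
; NOTE: a minimal scalar C sketch of cpy2Dto1D_shr, assuming square
; blockSize x blockSize blocks and shift >= 1 as in the kernels above; the
; name and blockSize parameter are illustrative only, not part of the build:
;
;   #include <stdint.h>
;   static void cpy2Dto1D_shr_c(int16_t *dst, const int16_t *src,
;                               intptr_t srcStride, int shift, int blockSize)
;   {
;       int16_t round = (int16_t)(1 << (shift - 1));
;       for (int y = 0; y < blockSize; y++) {
;           for (int x = 0; x < blockSize; x++)
;               *dst++ = (int16_t)((src[x] + round) >> shift);
;           src += srcStride;
;       }
;   }
;
; The asm keeps -round in m1 and uses psubw, so src - (-round) == src + round.
;--------------------------------------------------------------------------------------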
;--------------------------------------------------------------------------------------
; void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
;--------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal cpy2Dto1D_shr_16, 3, 4, 4
    add r2d, r2d
    movd m0, r3m
    pcmpeqw m1, m1
    psllw m1, m0
    psraw m1, 1
    mov r3d, 16/2

    ; register alloc
    ; r0 - dst
    ; r1 - src
    ; r2 - srcStride
    ; r3 - loop counter
    ; m0 - shift
    ; m1 - word [-round]

.loop:
    ; Row 0
    mova m2, [r1 + 0 * mmsize]
    mova m3, [r1 + 1 * mmsize]
    psubw m2, m1
    psubw m3, m1
    psraw m2, m0
    psraw m3, m0
    mova [r0 + 0 * mmsize], m2
    mova [r0 + 1 * mmsize], m3

    ; Row 1
    mova m2, [r1 + r2 + 0 * mmsize]
    mova m3, [r1 + r2 + 1 * mmsize]
    psubw m2, m1
    psubw m3, m1
    psraw m2, m0
    psraw m3, m0
    mova [r0 + 2 * mmsize], m2
    mova [r0 + 3 * mmsize], m3

    add r0, 4 * mmsize
    lea r1, [r1 + r2 * 2]
    dec r3d
    jnz .loop
    RET

INIT_YMM avx2
cglobal cpy2Dto1D_shr_16, 4, 5, 4
    add r2d, r2d
    movd xm0, r3d
    pcmpeqw m1, m1
    psllw m1, xm0
    psraw m1, 1
    lea r3, [r2 * 3]
    mov r4d, 16/8
.loop:
    ; Row 0-1
    movu m2, [r1]
    movu m3, [r1 + r2]
    psubw m2, m1
    psraw m2, xm0
    psubw m3, m1
    psraw m3, xm0
    movu [r0 + 0 * mmsize], m2
    movu [r0 + 1 * mmsize], m3

    ; Row 2-3
    movu m2, [r1 + 2 * r2]
    movu m3, [r1 + r3]
    psubw m2, m1
    psraw m2, xm0
    psubw m3, m1
    psraw m3, xm0
    movu [r0 + 2 * mmsize], m2
    movu [r0 + 3 * mmsize], m3

    ; Row 4-5
    lea r1, [r1 + 4 * r2]
    movu m2, [r1]
    movu m3, [r1 + r2]
    psubw m2, m1
    psraw m2, xm0
    psubw m3, m1
    psraw m3, xm0
    movu [r0 + 4 * mmsize], m2
    movu [r0 + 5 * mmsize], m3

    ; Row 6-7
    movu m2, [r1 + 2 * r2]
    movu m3, [r1 + r3]
    psubw m2, m1
    psraw m2, xm0
    psubw m3, m1
    psraw m3, xm0
    movu [r0 + 6 * mmsize], m2
    movu [r0 + 7 * mmsize], m3

    add r0, 8 * mmsize
    lea r1, [r1 + 4 * r2]
    dec r4d
    jnz .loop
    RET

;--------------------------------------------------------------------------------------
; void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
;--------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal cpy2Dto1D_shr_32, 3, 4, 6
    add r2d, r2d
    movd m0, r3m
    pcmpeqw m1, m1
    psllw m1, m0
    psraw m1, 1
    mov r3d, 32/1

    ; register alloc
    ; r0 - dst
    ; r1 - src
    ; r2 - srcStride
    ; r3 - loop counter
    ; m0 - shift
    ; m1 - word [-round]

.loop:
    ; Row 0
    mova m2, [r1 + 0 * mmsize]
    mova m3, [r1 + 1 * mmsize]
    mova m4, [r1 + 2 * mmsize]
    mova m5, [r1 + 3 * mmsize]
    psubw m2, m1
    psubw m3, m1
    psubw m4, m1
    psubw m5, m1
    psraw m2, m0
    psraw m3, m0
    psraw m4, m0
    psraw m5, m0
    mova [r0 + 0 * mmsize], m2
    mova [r0 + 1 * mmsize], m3
    mova [r0 + 2 * mmsize], m4
    mova [r0 + 3 * mmsize], m5

    add r0, 4 * mmsize
    add r1, r2
    dec r3d
    jnz .loop
    RET

INIT_YMM avx2
cglobal cpy2Dto1D_shr_32, 4, 5, 4
    add r2d, r2d
    movd xm0, r3d
    pcmpeqw m1, m1
    psllw m1, xm0
    psraw m1, 1
    lea r3, [r2 * 3]
    mov r4d, 32/4
.loop:
    ; Row 0
    movu m2, [r1]
    movu m3, [r1 + 32]
    psubw m2, m1
    psraw m2, xm0
    psubw m3, m1
    psraw m3, xm0
    movu [r0 + 0 * mmsize], m2
    movu [r0 + 1 * mmsize], m3

    ; Row 1
    movu m2, [r1 + r2]
    movu m3, [r1 + r2 + 32]
    psubw m2, m1
    psraw m2, xm0
    psubw m3, m1
    psraw m3, xm0
    movu [r0 + 2 * mmsize], m2
    movu [r0 + 3 * mmsize], m3

    ; Row 2
    movu m2, [r1 + 2 * r2]
    movu m3, [r1 + 2 * r2 + 32]
    psubw m2, m1
    psraw m2, xm0
    psubw m3, m1
    psraw m3, xm0
    movu [r0 + 4 * mmsize], m2
    movu [r0 + 5 * mmsize], m3

    ; Row 3
    movu m2, [r1 + r3]
    movu m3, [r1 + r3 + 32]
    psubw m2, m1
    psraw m2, xm0
    psubw m3, m1
    psraw m3, xm0
    movu [r0 + 6 * mmsize], m2
    movu [r0 + 7 * mmsize], m3

    add r0, 8 * mmsize
    lea r1, [r1 + 4 * r2]
    dec r4d
    jnz .loop
    RET

;--------------------------------------------------------------------------------------
; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
;--------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal cpy1Dto2D_shl_4, 3, 3, 3
    add r2d, r2d
    movd m0, r3m

    ; Row 0-3
    mova m1, [r1 + 0 * mmsize]
    mova m2, [r1 + 1 * mmsize]
    psllw m1, m0
    psllw m2, m0
    movh [r0], m1
    movhps [r0 + r2], m1
    movh [r0 + r2 * 2], m2
    lea r2, [r2 * 3]
    movhps [r0 + r2], m2
    RET
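;--------------------------------------------------------------------------------------
; NOTE: cpy1Dto2D_shl is the opposite staging direction, copying a linear
; coefficient buffer back into a strided block with a plain left shift and no
; rounding. A scalar C sketch (name and blockSize parameter illustrative only,
; not part of the build):
;
;   #include <stdint.h>
;   static void cpy1Dto2D_shl_c(int16_t *dst, const int16_t *src,
;                               intptr_t dstStride, int shift, int blockSize)
;   {
;       for (int y = 0; y < blockSize; y++) {
;           for (int x = 0; x < blockSize; x++)
;               dst[x] = (int16_t)(*src++ << shift);
;           dst += dstStride;
;       }
;   }
;--------------------------------------------------------------------------------------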
INIT_YMM avx2
cglobal cpy1Dto2D_shl_4, 3, 3, 2
    add r2d, r2d
    movd xm0, r3m

    ; Row 0-3
    movu m1, [r1]
    psllw m1, xm0
    vextracti128 xm0, m1, 1
    movq [r0], xm1
    movhps [r0 + r2], xm1
    lea r0, [r0 + r2 * 2]
    movq [r0], xm0
    movhps [r0 + r2], xm0
    RET

;--------------------------------------------------------------------------------------
; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
;--------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal cpy1Dto2D_shl_8, 3, 4, 5
    add r2d, r2d
    movd m0, r3m
    lea r3, [r2 * 3]

    ; Row 0-3
    mova m1, [r1 + 0 * mmsize]
    mova m2, [r1 + 1 * mmsize]
    mova m3, [r1 + 2 * mmsize]
    mova m4, [r1 + 3 * mmsize]
    psllw m1, m0
    psllw m2, m0
    psllw m3, m0
    psllw m4, m0
    mova [r0], m1
    mova [r0 + r2], m2
    mova [r0 + r2 * 2], m3
    mova [r0 + r3], m4
    lea r0, [r0 + r2 * 4]

    ; Row 4-7
    mova m1, [r1 + 4 * mmsize]
    mova m2, [r1 + 5 * mmsize]
    mova m3, [r1 + 6 * mmsize]
    mova m4, [r1 + 7 * mmsize]
    psllw m1, m0
    psllw m2, m0
    psllw m3, m0
    psllw m4, m0
    mova [r0], m1
    mova [r0 + r2], m2
    mova [r0 + r2 * 2], m3
    mova [r0 + r3], m4
    RET

INIT_YMM avx2
cglobal cpy1Dto2D_shl_8, 3, 4, 3
    add r2d, r2d
    movd xm0, r3m
    lea r3, [r2 * 3]

    ; Row 0-3
    movu m1, [r1 + 0 * mmsize]
    movu m2, [r1 + 1 * mmsize]
    psllw m1, xm0
    psllw m2, xm0
    movu [r0], xm1
    vextracti128 [r0 + r2], m1, 1
    movu [r0 + r2 * 2], xm2
    vextracti128 [r0 + r3], m2, 1

    ; Row 4-7
    movu m1, [r1 + 2 * mmsize]
    movu m2, [r1 + 3 * mmsize]
    lea r0, [r0 + r2 * 4]
    psllw m1, xm0
    psllw m2, xm0
    movu [r0], xm1
    vextracti128 [r0 + r2], m1, 1
    movu [r0 + r2 * 2], xm2
    vextracti128 [r0 + r3], m2, 1
    RET

;--------------------------------------------------------------------------------------
; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
;--------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal cpy1Dto2D_shl_16, 3, 4, 5
    add r2d, r2d
    movd m0, r3m
    mov r3d, 16/4
.loop:
    ; Row 0-1
    mova m1, [r1 + 0 * mmsize]
    mova m2, [r1 + 1 * mmsize]
    mova m3, [r1 + 2 * mmsize]
    mova m4, [r1 + 3 * mmsize]
    psllw m1, m0
    psllw m2, m0
    psllw m3, m0
    psllw m4, m0
    mova [r0], m1
    mova [r0 + 16], m2
    mova [r0 + r2], m3
    mova [r0 + r2 + 16], m4

    ; Row 2-3
    mova m1, [r1 + 4 * mmsize]
    mova m2, [r1 + 5 * mmsize]
    mova m3, [r1 + 6 * mmsize]
    mova m4, [r1 + 7 * mmsize]
    lea r0, [r0 + r2 * 2]
    psllw m1, m0
    psllw m2, m0
    psllw m3, m0
    psllw m4, m0
    mova [r0], m1
    mova [r0 + 16], m2
    mova [r0 + r2], m3
    mova [r0 + r2 + 16], m4

    add r1, 8 * mmsize
    lea r0, [r0 + r2 * 2]
    dec r3d
    jnz .loop
    RET

INIT_YMM avx2
cglobal cpy1Dto2D_shl_16, 3, 5, 3
    add r2d, r2d
    movd xm0, r3m
    mov r3d, 16/4
    lea r4, [r2 * 3]
.loop:
    ; Row 0-1
    movu m1, [r1 + 0 * mmsize]
    movu m2, [r1 + 1 * mmsize]
    psllw m1, xm0
    psllw m2, xm0
    movu [r0], m1
    movu [r0 + r2], m2

    ; Row 2-3
    movu m1, [r1 + 2 * mmsize]
    movu m2, [r1 + 3 * mmsize]
    psllw m1, xm0
    psllw m2, xm0
    movu [r0 + r2 * 2], m1
    movu [r0 + r4], m2

    add r1, 4 * mmsize
    lea r0, [r0 + r2 * 4]
    dec r3d
    jnz .loop
    RET

;--------------------------------------------------------------------------------------
; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
;--------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal cpy1Dto2D_shl_32, 3, 4, 5
    add r2d, r2d
    movd m0, r3m
    mov r3d, 32/2
.loop:
    ; Row 0
    mova m1, [r1 + 0 * mmsize]
    mova m2, [r1 + 1 * mmsize]
    mova m3, [r1 + 2 * mmsize]
    mova m4, [r1 + 3 * mmsize]
    psllw m1, m0
    psllw m2, m0
    psllw m3, m0
    psllw m4, m0
    mova [r0 + 0 * mmsize], m1
    mova [r0 + 1 * mmsize], m2
    mova [r0 + 2 * mmsize], m3
    mova [r0 + 3 * mmsize], m4

    ; Row 1
    mova m1, [r1 + 4 * mmsize]
    mova m2, [r1 + 5 * mmsize]
    mova m3, [r1 + 6 * mmsize]
    mova m4, [r1 + 7 * mmsize]
    psllw m1, m0
    psllw m2, m0
    psllw m3, m0
    psllw m4, m0
    mova [r0 + r2 + 0 * mmsize], m1
    mova [r0 + r2 + 1 * mmsize], m2
    mova [r0 + r2 + 2 * mmsize], m3
    mova [r0 + r2 + 3 * mmsize], m4

    add r1, 8 * mmsize
    lea r0, [r0 + r2 * 2]
    dec r3d
    jnz .loop
    RET

INIT_YMM avx2
cglobal cpy1Dto2D_shl_32, 3, 4, 5
    add r2d, r2d
    movd xm0, r3m
    mov r3d, 32/2
.loop:
    ; Row 0-1
    movu m1, [r1 + 0 * mmsize]
    movu m2, [r1 + 1 * mmsize]
    movu m3, [r1 + 2 * mmsize]
    movu m4, [r1 + 3 * mmsize]
    psllw m1, xm0
    psllw m2, xm0
    psllw m3, xm0
    psllw m4, xm0
    movu [r0], m1
    movu [r0 + mmsize], m2
    movu [r0 + r2], m3
    movu [r0 + r2 + mmsize], m4

    add r1, 4 * mmsize
    lea r0, [r0 + r2 * 2]
    dec r3d
    jnz .loop
    RET

;--------------------------------------------------------------------------------------
; uint32_t copy_cnt(int16_t* dst, const int16_t* src, intptr_t srcStride);
;--------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal copy_cnt_4, 3,3,3
    add r2d, r2d
    pxor m2, m2

    ; row 0 & 1
    movh m0, [r1]
    movhps m0, [r1 + r2]
    mova [r0], m0

    ; row 2 & 3
    movh m1, [r1 + r2 * 2]
    lea r2, [r2 * 3]
    movhps m1, [r1 + r2]
    mova [r0 + 16], m1

    packsswb m0, m1
    pcmpeqb m0, m2

    ; get count
    ; CHECK_ME: Intel's documentation lists POPCNT as an SSE4.2 instruction,
    ; but it is only implemented from Nehalem onward
%if 0
    pmovmskb eax, m0
    not ax
    popcnt ax, ax
%else
    mova m1, [pb_1]
    paddb m0, m1
    psadbw m0, m2
    pshufd m1, m0, 2
    paddw m0, m1
    movd eax, m0
%endif
    RET

;--------------------------------------------------------------------------------------
; uint32_t copy_cnt(int16_t* dst, const int16_t* src, intptr_t srcStride);
;--------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal copy_cnt_8, 3,3,6
    add r2d, r2d
    pxor m4, m4
    pxor m5, m5

    ; row 0 & 1
    movu m0, [r1]
    movu m1, [r1 + r2]
    movu [r0], m0
    movu [r0 + 16], m1
    packsswb m0, m1
    pcmpeqb m0, m4
    paddb m5, m0

    ; row 2 & 3
    lea r1, [r1 + 2 * r2]
    movu m0, [r1]
    movu m1, [r1 + r2]
    movu [r0 + 32], m0
    movu [r0 + 48], m1
    packsswb m0, m1
    pcmpeqb m0, m4
    paddb m5, m0

    ; row 4 & 5
    lea r1, [r1 + 2 * r2]
    movu m0, [r1]
    movu m1, [r1 + r2]
    movu [r0 + 64], m0
    movu [r0 + 80], m1
    packsswb m0, m1
    pcmpeqb m0, m4
    paddb m5, m0

    ; row 6 & 7
    lea r1, [r1 + 2 * r2]
    movu m0, [r1]
    movu m1, [r1 + r2]
    movu [r0 + 96], m0
    movu [r0 + 112], m1
    packsswb m0, m1
    pcmpeqb m0, m4
    paddb m5, m0

    ; get count
    mova m0, [pb_4]
    paddb m5, m0
    psadbw m5, m4
    pshufd m0, m5, 2
    paddw m5, m0
    movd eax, m5
    RET

INIT_YMM avx2
cglobal copy_cnt_8, 3,4,5
    add r2d, r2d
    lea r3, [r2 * 3]

    ; row 0 - 1
    movu xm0, [r1]
    vinserti128 m0, m0, [r1 + r2], 1
    movu [r0], m0

    ; row 2 - 3
    movu xm1, [r1 + r2 * 2]
    vinserti128 m1, m1, [r1 + r3], 1
    movu [r0 + 32], m1
    lea r1, [r1 + r2 * 4]

    ; row 4 - 5
    movu xm2, [r1]
    vinserti128 m2, m2, [r1 + r2], 1
    movu [r0 + 64], m2

    ; row 6 - 7
    movu xm3, [r1 + r2 * 2]
    vinserti128 m3, m3, [r1 + r3], 1
    movu [r0 + 96], m3

    ; get count
    xorpd m4, m4
    vpacksswb m0, m1
    vpacksswb m2, m3
    pminub m0, [pb_1]
    pminub m2, [pb_1]
    paddb m0, m2
    vextracti128 xm1, m0, 1
    paddb xm0, xm1
    psadbw xm0, xm4
    movhlps xm1, xm0
    paddd xm0, xm1
    movd eax, xm0
    RET
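;--------------------------------------------------------------------------------------
; NOTE: copy_cnt copies a strided coefficient block into a linear buffer and
; returns the number of nonzero coefficients; the SIMD paths tally via
; packsswb/pcmpeqb/psadbw instead of a scalar counter. A scalar C sketch
; (name and blockSize parameter illustrative only, not part of the build):
;
;   #include <stdint.h>
;   static uint32_t copy_cnt_c(int16_t *dst, const int16_t *src,
;                              intptr_t srcStride, int blockSize)
;   {
;       uint32_t cnt = 0;
;       for (int y = 0; y < blockSize; y++) {
;           for (int x = 0; x < blockSize; x++) {
;               dst[x] = src[x];
;               cnt += (src[x] != 0);
;           }
;           dst += blockSize;
;           src += srcStride;
;       }
;       return cnt;
;   }
;--------------------------------------------------------------------------------------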
;--------------------------------------------------------------------------------------
; uint32_t copy_cnt(int16_t* dst, const int16_t* src, intptr_t srcStride);
;--------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal copy_cnt_16, 3,4,6
    add r2d, r2d
    mov r3d, 4
    pxor m4, m4
    pxor m5, m5
.loop:
    ; row 0
    movu m0, [r1]
    movu m1, [r1 + 16]
    movu [r0], m0
    movu [r0 + 16], m1
    packsswb m0, m1
    pcmpeqb m0, m4
    paddb m5, m0

    ; row 1
    movu m0, [r1 + r2]
    movu m1, [r1 + r2 + 16]
    movu [r0 + 32], m0
    movu [r0 + 48], m1
    packsswb m0, m1
    pcmpeqb m0, m4
    paddb m5, m0

    ; row 2
    movu m0, [r1 + 2 * r2]
    movu m1, [r1 + 2 * r2 + 16]
    movu [r0 + 64], m0
    movu [r0 + 80], m1
    packsswb m0, m1
    pcmpeqb m0, m4
    paddb m5, m0

    ; row 3
    lea r1, [r1 + 2 * r2]
    movu m0, [r1 + r2]
    movu m1, [r1 + r2 + 16]
    movu [r0 + 96], m0
    movu [r0 + 112], m1
    packsswb m0, m1
    pcmpeqb m0, m4
    paddb m5, m0

    add r0, 128
    lea r1, [r1 + 2 * r2]
    dec r3d
    jnz .loop

    mova m0, [pb_16]
    paddb m5, m0
    psadbw m5, m4
    pshufd m0, m5, 2
    paddw m5, m0
    movd eax, m5
    RET

INIT_YMM avx2
cglobal copy_cnt_16, 3, 5, 5
    add r2d, r2d
    lea r3, [r2 * 3]
    mov r4d, 16/4
    mova m3, [pb_1]
    xorpd m4, m4
.loop:
    ; row 0 - 1
    movu m0, [r1]
    movu [r0], m0
    movu m1, [r1 + r2]
    movu [r0 + 32], m1
    packsswb m0, m1
    pminub m0, m3

    ; row 2 - 3
    movu m1, [r1 + r2 * 2]
    movu [r0 + 64], m1
    movu m2, [r1 + r3]
    movu [r0 + 96], m2
    packsswb m1, m2
    pminub m1, m3

    paddb m0, m1
    paddb m4, m0

    add r0, 128
    lea r1, [r1 + 4 * r2]
    dec r4d
    jnz .loop

    ; get count
    xorpd m0, m0
    vextracti128 xm1, m4, 1
    paddb xm4, xm1
    psadbw xm4, xm0
    movhlps xm1, xm4
    paddd xm4, xm1
    movd eax, xm4
    RET

;--------------------------------------------------------------------------------------
; uint32_t copy_cnt(int16_t* dst, const int16_t* src, intptr_t srcStride);
;--------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal copy_cnt_32, 3,4,6
    add r2d, r2d
    mov r3d, 16
    pxor m4, m4
    pxor m5, m5
.loop:
    ; row 0
    movu m0, [r1]
    movu m1, [r1 + 16]
    movu [r0], m0
    movu [r0 + 16], m1
    packsswb m0, m1
    pcmpeqb m0, m4
    paddb m5, m0

    movu m0, [r1 + 32]
    movu m1, [r1 + 48]
    movu [r0 + 32], m0
    movu [r0 + 48], m1
    packsswb m0, m1
    pcmpeqb m0, m4
    paddb m5, m0

    ; row 1
    movu m0, [r1 + r2]
    movu m1, [r1 + r2 + 16]
    movu [r0 + 64], m0
    movu [r0 + 80], m1
    packsswb m0, m1
    pcmpeqb m0, m4
    paddb m5, m0

    movu m0, [r1 + r2 + 32]
    movu m1, [r1 + r2 + 48]
    movu [r0 + 96], m0
    movu [r0 + 112], m1
    packsswb m0, m1
    pcmpeqb m0, m4
    paddb m5, m0

    add r0, 128
    lea r1, [r1 + 2 * r2]
    dec r3d
    jnz .loop

    ; get count
    mova m0, [pb_64]
    paddb m5, m0
    psadbw m5, m4
    pshufd m0, m5, 2
    paddw m5, m0
    movd eax, m5
    RET

INIT_YMM avx2
cglobal copy_cnt_32, 3, 5, 5
    add r2d, r2d
    mov r3d, 32/2
    mova m3, [pb_1]
    xorpd m4, m4
.loop:
    ; row 0
    movu m0, [r1]
    movu [r0], m0
    movu m1, [r1 + 32]
    movu [r0 + 32], m1
    packsswb m0, m1
    pminub m0, m3

    ; row 1
    movu m1, [r1 + r2]
    movu [r0 + 64], m1
    movu m2, [r1 + r2 + 32]
    movu [r0 + 96], m2
    packsswb m1, m2
    pminub m1, m3

    paddb m0, m1
    paddb m4, m0

    add r0, 128
    lea r1, [r1 + 2 * r2]
    dec r3d
    jnz .loop

    ; get count
    xorpd m0, m0
    vextracti128 xm1, m4, 1
    paddb xm4, xm1
    psadbw xm4, xm0
    movhlps xm1, xm4
    paddd xm4, xm1
    movd eax, xm4
    RET

;--------------------------------------------------------------------------------------
; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
;--------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal cpy2Dto1D_shl_4, 4, 4, 4
    add r2d, r2d
    movd m0, r3d

    ; register alloc
    ; r0 - dst
    ; r1 - src
    ; r2 - srcStride
    ; m0 - shift

    ; Row 0-3
    movh m2, [r1]
    movhps m2, [r1 + r2]
    lea r1, [r1 + r2 * 2]
    movh m3, [r1]
    movhps m3, [r1 + r2]
    psllw m2, m0
    psllw m3, m0
    mova [r0 + 0 * mmsize], m2
    mova [r0 + 1 * mmsize], m3
    RET
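;--------------------------------------------------------------------------------------
; NOTE: cpy2Dto1D_shl mirrors cpy2Dto1D_shr above but applies a left shift
; with no rounding term, so these kernels need no [-round] register. A scalar
; C sketch (name and blockSize parameter illustrative only, not part of the
; build):
;
;   #include <stdint.h>
;   static void cpy2Dto1D_shl_c(int16_t *dst, const int16_t *src,
;                               intptr_t srcStride, int shift, int blockSize)
;   {
;       for (int y = 0; y < blockSize; y++) {
;           for (int x = 0; x < blockSize; x++)
;               *dst++ = (int16_t)(src[x] << shift);
;           src += srcStride;
;       }
;   }
;--------------------------------------------------------------------------------------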
;--------------------------------------------------------------------------------------
; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
;--------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal cpy2Dto1D_shl_8, 4, 5, 4
    add r2d, r2d
    movd m0, r3d
    mov r3d, 8/4
    lea r4, [r2 * 3]

    ; register alloc
    ; r0 - dst
    ; r1 - src
    ; r2 - srcStride
    ; r3 - loop counter
    ; r4 - stride * 3
    ; m0 - shift

.loop:
    ; Row 0, 1
    mova m2, [r1]
    mova m3, [r1 + r2]
    psllw m2, m0
    psllw m3, m0
    mova [r0 + 0 * mmsize], m2
    mova [r0 + 1 * mmsize], m3

    ; Row 2, 3
    mova m2, [r1 + r2 * 2]
    mova m3, [r1 + r4]
    psllw m2, m0
    psllw m3, m0
    mova [r0 + 2 * mmsize], m2
    mova [r0 + 3 * mmsize], m3

    add r0, 4 * mmsize
    lea r1, [r1 + r2 * 4]
    dec r3d
    jnz .loop
    RET

;--------------------------------------------------------------------------------------
; void cpy2Dto1D_shl_8(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
;--------------------------------------------------------------------------------------
INIT_YMM avx2
cglobal cpy2Dto1D_shl_8, 4, 5, 2
    add r2d, r2d
    movd xm0, r3d
    lea r4, [3 * r2]

    ; Row 0, 1
    movu xm1, [r1]
    vinserti128 m1, m1, [r1 + r2], 1
    psllw m1, xm0
    movu [r0], m1

    ; Row 2, 3
    movu xm1, [r1 + 2 * r2]
    vinserti128 m1, m1, [r1 + r4], 1
    psllw m1, xm0
    movu [r0 + 32], m1

    lea r1, [r1 + 4 * r2]

    ; Row 4, 5
    movu xm1, [r1]
    vinserti128 m1, m1, [r1 + r2], 1
    psllw m1, xm0
    movu [r0 + 64], m1

    ; Row 6, 7
    movu xm1, [r1 + 2 * r2]
    vinserti128 m1, m1, [r1 + r4], 1
    psllw m1, xm0
    movu [r0 + 96], m1
    RET

;--------------------------------------------------------------------------------------
; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
;--------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal cpy2Dto1D_shl_16, 4, 4, 4
    add r2d, r2d
    movd m0, r3d
    mov r3d, 16/2

    ; register alloc
    ; r0 - dst
    ; r1 - src
    ; r2 - srcStride
    ; r3 - loop counter
    ; m0 - shift

.loop:
    ; Row 0
    mova m2, [r1 + 0 * mmsize]
    mova m3, [r1 + 1 * mmsize]
    psllw m2, m0
    psllw m3, m0
    mova [r0 + 0 * mmsize], m2
    mova [r0 + 1 * mmsize], m3

    ; Row 1
    mova m2, [r1 + r2 + 0 * mmsize]
    mova m3, [r1 + r2 + 1 * mmsize]
    psllw m2, m0
    psllw m3, m0
    mova [r0 + 2 * mmsize], m2
    mova [r0 + 3 * mmsize], m3

    add r0, 4 * mmsize
    lea r1, [r1 + r2 * 2]
    dec r3d
    jnz .loop
    RET

;--------------------------------------------------------------------------------------
; void cpy2Dto1D_shl_16(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
;--------------------------------------------------------------------------------------
INIT_YMM avx2
cglobal cpy2Dto1D_shl_16, 3, 5, 3
    add r2d, r2d
    movd xm0, r3m
    mov r3d, 16/4
    lea r4, [r2 * 3]
.loop:
    ; Row 0-1
    movu m1, [r1]
    movu m2, [r1 + r2]
    psllw m1, xm0
    psllw m2, xm0
    movu [r0 + 0 * mmsize], m1
    movu [r0 + 1 * mmsize], m2

    ; Row 2-3
    movu m1, [r1 + 2 * r2]
    movu m2, [r1 + r4]
    psllw m1, xm0
    psllw m2, xm0
    movu [r0 + 2 * mmsize], m1
    movu [r0 + 3 * mmsize], m2

    add r0, 4 * mmsize
    lea r1, [r1 + r2 * 4]
    dec r3d
    jnz .loop
    RET

;--------------------------------------------------------------------------------------
; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
;--------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal cpy2Dto1D_shl_32, 4, 4, 6
    add r2d, r2d
    movd m0, r3d
    mov r3d, 32/1

    ; register alloc
    ; r0 - dst
    ; r1 - src
    ; r2 - srcStride
    ; r3 - loop counter
    ; m0 - shift

.loop:
    ; Row 0
    mova m2, [r1 + 0 * mmsize]
    mova m3, [r1 + 1 * mmsize]
    mova m4, [r1 + 2 * mmsize]
    mova m5, [r1 + 3 * mmsize]
    psllw m2, m0
    psllw m3, m0
    psllw m4, m0
    psllw m5, m0
    mova [r0 + 0 * mmsize], m2
    mova [r0 + 1 * mmsize], m3
    mova [r0 + 2 * mmsize], m4
    mova [r0 + 3 * mmsize], m5

    add r0, 4 * mmsize
    add r1, r2
    dec r3d
    jnz .loop
    RET

;--------------------------------------------------------------------------------------
; void cpy2Dto1D_shl_32(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
;--------------------------------------------------------------------------------------
INIT_YMM avx2
cglobal cpy2Dto1D_shl_32, 3, 5, 5
    add r2d, r2d
    movd xm0, r3m
    mov r3d, 32/4
    lea r4, [3 * r2]
.loop:
    ; Row 0-1
    movu m1, [r1]
    movu m2, [r1 + 32]
    movu m3, [r1 + r2]
    movu m4, [r1 + r2 + 32]
    psllw m1, xm0
    psllw m2, xm0
    psllw m3, xm0
    psllw m4, xm0
    movu [r0], m1
    movu [r0 + mmsize], m2
    movu [r0 + 2 * mmsize], m3
    movu [r0 + 3 * mmsize], m4

    ; Row 2-3
    movu m1, [r1 + 2 * r2]
    movu m2, [r1 + 2 * r2 + 32]
    movu m3, [r1 + r4]
    movu m4, [r1 + r4 + 32]
    psllw m1, xm0
    psllw m2, xm0
    psllw m3, xm0
    psllw m4, xm0
    movu [r0 + 4 * mmsize], m1
    movu [r0 + 5 * mmsize], m2
    movu [r0 + 6 * mmsize], m3
    movu [r0 + 7 * mmsize], m4

    add r0, 8 * mmsize
    lea r1, [r1 + r2 * 4]
    dec r3d
    jnz .loop
    RET

;--------------------------------------------------------------------------------------
; void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
;--------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal cpy1Dto2D_shr_4, 3, 3, 4
    add r2d, r2d
    movd m0, r3m
    pcmpeqw m1, m1
    psllw m1, m0
    psraw m1, 1

    ; Row 0-3
    mova m2, [r1 + 0 * mmsize]
    mova m3, [r1 + 1 * mmsize]
    psubw m2, m1
    psubw m3, m1
    psraw m2, m0
    psraw m3, m0
    movh [r0], m2
    movhps [r0 + r2], m2
    movh [r0 + r2 * 2], m3
    lea r2, [r2 * 3]
    movhps [r0 + r2], m3
    RET

INIT_YMM avx2
cglobal cpy1Dto2D_shr_4, 3, 3, 3
    add r2d, r2d
    movd xm0, r3m
    pcmpeqw m1, m1
    psllw m1, xm0
    psraw m1, 1

    ; Row 0-3
    movu m2, [r1]
    psubw m2, m1
    psraw m2, xm0
    vextracti128 xm1, m2, 1
    movq [r0], xm2
    movhps [r0 + r2], xm2
    lea r0, [r0 + r2 * 2]
    movq [r0], xm1
    movhps [r0 + r2], xm1
    RET

;--------------------------------------------------------------------------------------
; void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
;--------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal cpy1Dto2D_shr_8, 3, 4, 6
    add r2d, r2d
    movd m0, r3m
    pcmpeqw m1, m1
    psllw m1, m0
    psraw m1, 1
    lea r3, [r2 * 3]

    ; Row 0-3
    mova m2, [r1 + 0 * mmsize]
    mova m3, [r1 + 1 * mmsize]
    mova m4, [r1 + 2 * mmsize]
    mova m5, [r1 + 3 * mmsize]
    psubw m2, m1
    psubw m3, m1
    psubw m4, m1
    psubw m5, m1
    psraw m2, m0
    psraw m3, m0
    psraw m4, m0
    psraw m5, m0
    mova [r0], m2
    mova [r0 + r2], m3
    mova [r0 + r2 * 2], m4
    mova [r0 + r3], m5

    ; Row 4-7
    mova m2, [r1 + 4 * mmsize]
    mova m3, [r1 + 5 * mmsize]
    mova m4, [r1 + 6 * mmsize]
    mova m5, [r1 + 7 * mmsize]
    lea r0, [r0 + r2 * 4]
    psubw m2, m1
    psubw m3, m1
    psubw m4, m1
    psubw m5, m1
    psraw m2, m0
    psraw m3, m0
    psraw m4, m0
    psraw m5, m0
    mova [r0], m2
    mova [r0 + r2], m3
    mova [r0 + r2 * 2], m4
    mova [r0 + r3], m5
    RET

INIT_YMM avx2
cglobal cpy1Dto2D_shr_8, 3, 4, 4
    add r2d, r2d
    movd xm0, r3m
    pcmpeqw m1, m1
    psllw m1, xm0
    psraw m1, 1
    lea r3, [r2 * 3]

    ; Row 0-3
    movu m2, [r1 + 0 * mmsize]
    movu m3, [r1 + 1 * mmsize]
    psubw m2, m1
    psubw m3, m1
    psraw m2, xm0
    psraw m3, xm0
    movu [r0], xm2
    vextracti128 [r0 + r2], m2, 1
    movu [r0 + r2 * 2], xm3
    vextracti128 [r0 + r3], m3, 1

    ; Row 4-7
    movu m2, [r1 + 2 * mmsize]
    movu m3, [r1 + 3 * mmsize]
    lea r0, [r0 + r2 * 4]
    psubw m2, m1
    psubw m3, m1
    psraw m2, xm0
    psraw m3, xm0
    movu [r0], xm2
    vextracti128 [r0 + r2], m2, 1
    movu [r0 + r2 * 2], xm3
    vextracti128 [r0 + r3], m3, 1
    RET
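;--------------------------------------------------------------------------------------
; NOTE: every *_shr kernel materializes the word constant
; -round = -(1 << (shift - 1)) without a table load via pcmpeqw/psllw/psraw,
; then adds the rounding term with psubw. The same derivation in C, assuming
; 1 <= shift <= 15 (helper name illustrative only, not part of the build):
;
;   #include <assert.h>
;   #include <stdint.h>
;   static int16_t neg_round(int shift)
;   {
;       int16_t m = (int16_t)(0xFFFFu << shift);   /* pcmpeqw m1, m1 + psllw */
;       m = (int16_t)(m >> 1);                     /* psraw m1, 1            */
;       assert(m == -(1 << (shift - 1)));
;       return m;
;   }
;--------------------------------------------------------------------------------------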
;--------------------------------------------------------------------------------------
; void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
;--------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal cpy1Dto2D_shr_16, 3, 5, 6
    add r2d, r2d
    movd m0, r3m
    pcmpeqw m1, m1
    psllw m1, m0
    psraw m1, 1
    mov r3d, 16/4
    lea r4, [r2 * 3]
.loop:
    ; Row 0-1
    mova m2, [r1 + 0 * mmsize]
    mova m3, [r1 + 1 * mmsize]
    mova m4, [r1 + 2 * mmsize]
    mova m5, [r1 + 3 * mmsize]
    psubw m2, m1
    psubw m3, m1
    psubw m4, m1
    psubw m5, m1
    psraw m2, m0
    psraw m3, m0
    psraw m4, m0
    psraw m5, m0
    mova [r0], m2
    mova [r0 + mmsize], m3
    mova [r0 + r2], m4
    mova [r0 + r2 + mmsize], m5

    ; Row 2-3
    mova m2, [r1 + 4 * mmsize]
    mova m3, [r1 + 5 * mmsize]
    mova m4, [r1 + 6 * mmsize]
    mova m5, [r1 + 7 * mmsize]
    psubw m2, m1
    psubw m3, m1
    psubw m4, m1
    psubw m5, m1
    psraw m2, m0
    psraw m3, m0
    psraw m4, m0
    psraw m5, m0
    mova [r0 + r2 * 2], m2
    mova [r0 + r2 * 2 + mmsize], m3
    mova [r0 + r4], m4
    mova [r0 + r4 + mmsize], m5

    add r1, 8 * mmsize
    lea r0, [r0 + r2 * 4]
    dec r3d
    jnz .loop
    RET

INIT_YMM avx2
cglobal cpy1Dto2D_shr_16, 3, 5, 4
    add r2d, r2d
    movd xm0, r3m
    pcmpeqw m1, m1
    psllw m1, xm0
    psraw m1, 1
    mov r3d, 16/4
    lea r4, [r2 * 3]
.loop:
    ; Row 0-1
    movu m2, [r1 + 0 * mmsize]
    movu m3, [r1 + 1 * mmsize]
    psubw m2, m1
    psubw m3, m1
    psraw m2, xm0
    psraw m3, xm0
    movu [r0], m2
    movu [r0 + r2], m3

    ; Row 2-3
    movu m2, [r1 + 2 * mmsize]
    movu m3, [r1 + 3 * mmsize]
    psubw m2, m1
    psubw m3, m1
    psraw m2, xm0
    psraw m3, xm0
    movu [r0 + r2 * 2], m2
    movu [r0 + r4], m3

    add r1, 4 * mmsize
    lea r0, [r0 + r2 * 4]
    dec r3d
    jnz .loop
    RET

;--------------------------------------------------------------------------------------
; void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
;--------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal cpy1Dto2D_shr_32, 3, 4, 6
    add r2d, r2d
    movd m0, r3m
    pcmpeqw m1, m1
    psllw m1, m0
    psraw m1, 1
    mov r3d, 32/2
.loop:
    ; Row 0
    mova m2, [r1 + 0 * mmsize]
    mova m3, [r1 + 1 * mmsize]
    mova m4, [r1 + 2 * mmsize]
    mova m5, [r1 + 3 * mmsize]
    psubw m2, m1
    psubw m3, m1
    psubw m4, m1
    psubw m5, m1
    psraw m2, m0
    psraw m3, m0
    psraw m4, m0
    psraw m5, m0
    mova [r0 + 0 * mmsize], m2
    mova [r0 + 1 * mmsize], m3
    mova [r0 + 2 * mmsize], m4
    mova [r0 + 3 * mmsize], m5

    ; Row 1
    mova m2, [r1 + 4 * mmsize]
    mova m3, [r1 + 5 * mmsize]
    mova m4, [r1 + 6 * mmsize]
    mova m5, [r1 + 7 * mmsize]
    psubw m2, m1
    psubw m3, m1
    psubw m4, m1
    psubw m5, m1
    psraw m2, m0
    psraw m3, m0
    psraw m4, m0
    psraw m5, m0
    mova [r0 + r2 + 0 * mmsize], m2
    mova [r0 + r2 + 1 * mmsize], m3
    mova [r0 + r2 + 2 * mmsize], m4
    mova [r0 + r2 + 3 * mmsize], m5

    add r1, 8 * mmsize
    lea r0, [r0 + r2 * 2]
    dec r3d
    jnz .loop
    RET

INIT_YMM avx2
cglobal cpy1Dto2D_shr_32, 3, 4, 6
    add r2d, r2d
    movd xm0, r3m
    pcmpeqw m1, m1
    psllw m1, xm0
    psraw m1, 1
    mov r3d, 32/2
.loop:
    ; Row 0-1
    movu m2, [r1 + 0 * mmsize]
    movu m3, [r1 + 1 * mmsize]
    movu m4, [r1 + 2 * mmsize]
    movu m5, [r1 + 3 * mmsize]
    psubw m2, m1
    psubw m3, m1
    psubw m4, m1
    psubw m5, m1
    psraw m2, xm0
    psraw m3, xm0
    psraw m4, xm0
    psraw m5, xm0
    movu [r0], m2
    movu [r0 + mmsize], m3
    movu [r0 + r2], m4
    movu [r0 + r2 + mmsize], m5

    add r1, 4 * mmsize
    lea r0, [r0 + r2 * 2]
    dec r3d
    jnz .loop
    RET
;***************************************************************************** ;* Copyright (C) 2003-2013 x264 project ;* Copyright (C) 2013-2017 MulticoreWare, Inc ;* Copyright (C) 2018~ VCL, NELVT, Peking University ;* ;* Authors: Loren Merritt ;* Fiona Glaser ;* Min Chen ;* Praveen Kumar Tiwari ;* Jiaqi Zhang ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by ;* the Free Software Foundation; either version 2 of the License, or ;* (at your option) any later version. ;* ;* This program is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;* GNU General Public License for more details. ;* ;* You should have received a copy of the GNU General Public License ;* along with this program; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. ;* ;* This program is also available under a commercial proprietary license. ;* For more information, contact us at license @ x265.com. ;***************************************************************************** %include "x86inc.asm" SECTION_RODATA 32 ;; 8-bit constants const pb_0, times 32 db 0 const pb_1, times 32 db 1 const pb_2, times 32 db 2 const pb_3, times 32 db 3 const pb_4, times 32 db 4 const pb_8, times 32 db 8 const pb_15, times 32 db 15 const pb_16, times 32 db 16 const pb_31, times 32 db 31 const pb_32, times 32 db 32 const pb_64, times 32 db 64 const pb_124, times 32 db 124 const pb_128, times 32 db 128 const pb_a1, times 16 db 0xa1 const pb_01, times 8 db 0, 1 const pb_0123, times 4 db 0, 1 times 4 db 2, 3 const hsub_mul, times 16 db 1, -1 const pw_swap, times 2 db 6, 7, 4, 5, 2, 3, 0, 1 const pb_unpackbd1, times 2 db 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3 const pb_unpackbd2, times 2 db 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7 const pb_unpackwq1, times 1 db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3 const pb_unpackwq2, times 1 db 4, 5, 4, 5, 4, 5, 4, 5, 6, 7, 6, 7, 6, 7, 6, 7 const pb_shuf8x8c, times 1 db 0, 0, 0, 0, 2, 2, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6 const pb_movemask, times 16 db 0x00 times 16 db 0xFF const pb_movemask_32, times 32 db 0x00 times 32 db 0xFF times 32 db 0x00 const pb_0000000000000F0F, times 2 db 0xff, 0x00 times 12 db 0x00 const pb_000000000000000F, db 0xff times 15 db 0x00 const pb_shuf_off4, times 2 db 0, 4, 1, 5, 2, 6, 3, 7 const pw_shuf_off4, times 1 db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ;; 16-bit constants const pw_n1, times 16 dw -1 const pw_1, times 16 dw 1 const pw_2, times 16 dw 2 const pw_3, times 16 dw 3 const pw_7, times 16 dw 7 const pw_m2, times 8 dw -2 const pw_4, times 8 dw 4 const pw_8, times 8 dw 8 const pw_16, times 16 dw 16 const pw_15, times 16 dw 15 const pw_31, times 16 dw 31 const pw_32, times 16 dw 32 const pw_64, times 8 dw 64 const pw_128, times 16 dw 128 const pw_256, times 16 dw 256 const pw_257, times 16 dw 257 const pw_512, times 16 dw 512 const pw_1023, times 16 dw 1023 const pw_1024, times 16 dw 1024 const pw_2048, times 16 dw 2048 const pw_4096, times 16 dw 4096 const pw_8192, times 8 dw 8192 const pw_00ff, times 16 dw 0x00ff const pw_ff00, times 8 dw 0xff00 const pw_2000, times 16 dw 0x2000 const pw_8000, times 8 dw 0x8000 const pw_3fff, times 16 dw 0x3fff const pw_32_0, times 4 dw 32, times 4 dw 0 const pw_pixel_max, times 16 dw ((1 << BIT_DEPTH)-1) const pw_0_7, times 2 dw 
0, 1, 2, 3, 4, 5, 6, 7 const pw_ppppmmmm, times 1 dw 1, 1, 1, 1, -1, -1, -1, -1 const pw_ppmmppmm, times 1 dw 1, 1, -1, -1, 1, 1, -1, -1 const pw_pmpmpmpm, times 16 dw 1, -1, 1, -1, 1, -1, 1, -1 const pw_pmmpzzzz, times 1 dw 1, -1, -1, 1, 0, 0, 0, 0 const multi_2Row, times 1 dw 1, 2, 3, 4, 1, 2, 3, 4 const multiH, times 1 dw 9, 10, 11, 12, 13, 14, 15, 16 const multiH3, times 1 dw 25, 26, 27, 28, 29, 30, 31, 32 const multiL, times 1 dw 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 const multiH2, times 1 dw 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32 const pw_planar16_mul, times 1 dw 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 const pw_planar32_mul, times 1 dw 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16 const pw_FFFFFFFFFFFFFFF0, dw 0x00 times 7 dw 0xff const hmul_16p, times 16 db 1 times 8 db 1, -1 const pw_exp2_0_15, dw 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768 const pw_1_ffff, times 4 dw 1 times 4 dw 0xFFFF ;; 32-bit constants const pd_0, times 8 dd 0 const pd_1, times 8 dd 1 const pd_2, times 8 dd 2 const pd_3, times 8 dd 3 const pd_4, times 4 dd 4 const pd_8, times 4 dd 8 const pd_11, times 4 dd 11 const pd_12, times 4 dd 12 const pd_15, times 8 dd 15 const pd_16, times 8 dd 16 const pd_31, times 8 dd 31 const pd_32, times 8 dd 32 const pd_64, times 4 dd 64 const pd_128, times 4 dd 128 const pd_256, times 4 dd 256 const pd_512, times 4 dd 512 const pd_1024, times 4 dd 1024 const pd_2048, times 4 dd 2048 const pd_ffff, times 4 dd 0xffff const pd_32767, times 4 dd 32767 const pd_n32768, times 4 dd 0xffff8000 const pd_524416, times 4 dd 524416 const pd_n32768, times 8 dd 0xffff8000 const pd_n131072, times 4 dd 0xfffe0000 const pd_0000ffff, times 8 dd 0x0000FFFF const pd_planar16_mul0, times 1 dd 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 const pd_planar16_mul1, times 1 dd 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 const pd_planar32_mul1, times 1 dd 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16 const pd_planar32_mul2, times 1 dd 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32 const pd_planar16_mul2, times 1 dd 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 const trans8_shuf, times 1 dd 0, 4, 1, 5, 2, 6, 3, 7 ;; 64-bit constants const pq_1, times 1 dq 1 davs2-1.6/source/common/x86/cpu-a.asm000066400000000000000000000133431337322544400173360ustar00rootroot00000000000000;***************************************************************************** ;* cpu-a.asm: x86 cpu utilities ;***************************************************************************** ;* Copyright (C) 2003-2013 x264 project ;* Copyright (C) 2013-2017 MulticoreWare, Inc ;* Copyright (C) 2018~ VCL, NELVT, Peking University ;* ;* Authors: Laurent Aimar ;* Loren Merritt ;* Fiona Glaser ;* Jiaqi Zhang ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by ;* the Free Software Foundation; either version 2 of the License, or ;* (at your option) any later version. ;* ;* This program is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;* GNU General Public License for more details. 
;* ;* You should have received a copy of the GNU General Public License ;* along with this program; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. ;* ;* This program is also available under a commercial proprietary license. ;* For more information, contact us at license @ x265.com. ;***************************************************************************** %include "x86inc.asm" SECTION .text ;----------------------------------------------------------------------------- ; void cpu_cpuid( int op, int *eax, int *ebx, int *ecx, int *edx ) ;----------------------------------------------------------------------------- cglobal cpu_cpuid, 5,7 push rbx push r4 push r3 push r2 push r1 mov eax, r0d xor ecx, ecx cpuid pop r4 mov [r4], eax pop r4 mov [r4], ebx pop r4 mov [r4], ecx pop r4 mov [r4], edx pop rbx RET ;----------------------------------------------------------------------------- ; void cpu_xgetbv( int op, int *eax, int *edx ) ;----------------------------------------------------------------------------- cglobal cpu_xgetbv, 3,7 push r2 push r1 mov ecx, r0d xgetbv pop r4 mov [r4], eax pop r4 mov [r4], edx RET ;----------------------------------------------------------------------------- ; void cpuid_get_serial_number( int op, int *eax, int *ebx, int *ecx, int *edx ) ; 2017-06-18 luofl ;----------------------------------------------------------------------------- cglobal cpuid_get_serial_number, 5,7 push rbx push r4 push r3 push r2 push r1 ; first 64 bits mov eax, 00h xor edx, edx cpuid pop r4 mov [r4], edx pop r4 mov [r4], eax ; second 64 bits mov eax, 01h xor ecx, ecx xor edx, edx cpuid pop r4 mov [r4], edx pop r4 mov [r4], eax pop rbx RET %if ARCH_X86_64 ;----------------------------------------------------------------------------- ; void stack_align( void (*func)(void*), void *arg ); ;----------------------------------------------------------------------------- cglobal stack_align push rbp mov rbp, rsp %if WIN64 sub rsp, 32 ; shadow space %endif and rsp, ~31 mov rax, r0 mov r0, r1 mov r1, r2 mov r2, r3 call rax leave ret %else ;----------------------------------------------------------------------------- ; int cpu_cpuid_test( void ) ; return 0 if unsupported ;----------------------------------------------------------------------------- cglobal cpu_cpuid_test pushfd push ebx push ebp push esi push edi pushfd pop eax mov ebx, eax xor eax, 0x200000 push eax popfd pushfd pop eax xor eax, ebx pop edi pop esi pop ebp pop ebx popfd ret cglobal stack_align push ebp mov ebp, esp sub esp, 12 and esp, ~31 mov ecx, [ebp+8] mov edx, [ebp+12] mov [esp], edx mov edx, [ebp+16] mov [esp+4], edx mov edx, [ebp+20] mov [esp+8], edx call ecx leave ret %endif ;----------------------------------------------------------------------------- ; void cpu_emms( void ) ;----------------------------------------------------------------------------- cglobal cpu_emms emms ret ;----------------------------------------------------------------------------- ; void cpu_sfence( void ) ;----------------------------------------------------------------------------- cglobal cpu_sfence sfence ret %if 0 ; REMOVED cextern intel_cpu_indicator_init ;----------------------------------------------------------------------------- ; void safe_intel_cpu_indicator_init( void ); ;----------------------------------------------------------------------------- cglobal safe_intel_cpu_indicator_init push r0 push r1 push r2 push r3 push r4 push r5 push r6 %if ARCH_X86_64 push r7 push r8 
push r9 push r10 push r11 push r12 push r13 push r14 %endif push rbp mov rbp, rsp %if WIN64 sub rsp, 32 ; shadow space %endif and rsp, ~31 call intel_cpu_indicator_init leave %if ARCH_X86_64 pop r14 pop r13 pop r12 pop r11 pop r10 pop r9 pop r8 pop r7 %endif pop r6 pop r5 pop r4 pop r3 pop r2 pop r1 pop r0 ret %endif ; if 0davs2-1.6/source/common/x86/dct8.asm000066400000000000000000002717751337322544400172120ustar00rootroot00000000000000;***************************************************************************** ;* Copyright (C) 2013-2017 MulticoreWare, Inc ;* Copyright (C) 2018~ VCL, NELVT, Peking University ;* ;* Authors: Nabajit Deka ;* Min Chen ;* Li Cao ;* Praveen Kumar Tiwari ;* Jiaqi Zhang ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by ;* the Free Software Foundation; either version 2 of the License, or ;* (at your option) any later version. ;* ;* This program is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;* GNU General Public License for more details. ;* ;* You should have received a copy of the GNU General Public License ;* along with this program; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. ;* ;* This program is also available under a commercial proprietary license. ;* For more information, contact us at license @ x265.com. ;*****************************************************************************/ ;TO-DO : Further optimize the routines. %include "x86inc.asm" %include "x86util.asm" SECTION_RODATA 32 ; ---------------------------------------------------------------------------- ; dct4 tab_dct4: times 4 dw 32, 32 times 4 dw 42, 17 times 4 dw 32, -32 times 4 dw 17, -42 avx2_idct4_1: dw 32, 32, 32, 32, 32, 32, 32, 32, 32, -32, 32, -32, 32, -32, 32, -32 dw 42, 17, 42, 17, 42, 17, 42, 17, 17, -42, 17, -42, 17, -42, 17, -42 avx2_idct4_2: dw 32, 32, 32,-32, 42, 17, 17,-42 idct4_shuf1: times 2 db 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15 idct4_shuf2: times 2 db 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8 ,9 ,10, 11 ; ---------------------------------------------------------------------------- ; dct8 align 32 pb_idct8even: db 0, 1, 8, 9, 4, 5, 12, 13, 0, 1, 8, 9, 4, 5, 12, 13 tab_idct8_1: times 1 dw 32, -32, 17, -42, 32, 32, 42, 17 tab_idct8_2: times 1 dw 44, 38, 25, 9, 38, -9, -44, -25 times 1 dw 25, -44, 9, 38, 9, -25, 38, -44 tab_idct8_3: times 4 dw 44, 38 times 4 dw 25, 9 times 4 dw 38, -9 times 4 dw -44, -25 times 4 dw 25, -44 times 4 dw 9, 38 times 4 dw 9, -25 times 4 dw 38, -44 avx2_idct8_1: times 4 dw 32, 42, 32, 17 times 4 dw 32, 17, -32, -42 times 4 dw 32, -17, -32, 42 times 4 dw 32, -42, 32, -17 avx2_idct8_2: times 4 dw 44, 38, 25, 9 times 4 dw 38, -9, -44, -25 times 4 dw 25, -44, 9, 38 times 4 dw 9, -25, 38, -44 align 32 idct8_shuf1: dd 0, 2, 4, 6, 1, 3, 5, 7 idct8_shuf2: times 2 db 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 idct8_shuf3: times 2 db 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3 pb_idct8odd: db 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15 ; ---------------------------------------------------------------------------- ; dct16 align 32 dct16_shuf1: times 2 db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 tab_idct16_1: dw 45, 43, 40, 35, 29, 21, 13, 4 dw 43, 29, 4, -21, -40, -45, -35, -13 dw 40, 4, -35, -43, -13, 29, 45, 
21 dw 35, -21, -43, 4, 45, 13, -40, -29 dw 29, -40, -13, 45, -4, -43, 21, 35 dw 21, -45, 29, 13, -43, 35, 4, -40 dw 13, -35, 45, -40, 21, 4, -29, 43 dw 4, -13, 21, -29, 35, -40, 43, -45 tab_idct16_2: dw 32, 44, 42, 38, 32, 25, 17, 9 dw 32, 38, 17, -9, -32, -44, -42, -25 dw 32, 25, -17, -44, -32, 9, 42, 38 dw 32, 9, -42, -25, 32, 38, -17, -44 dw 32, -9, -42, 25, 32, -38, -17, 44 dw 32, -25, -17, 44, -32, -9, 42, -38 dw 32, -38, 17, 9, -32, 44, -42, 25 dw 32, -44, 42, -38, 32, -25, 17, -9 idct16_shuff: dd 0, 4, 2, 6, 1, 5, 3, 7 idct16_shuff1: dd 2, 6, 0, 4, 3, 7, 1, 5 ; ---------------------------------------------------------------------------- ; dct32 align 32 tab_idct32_1: dw 45, 45, 44, 43, 41, 39, 36, 34, 30, 27, 23, 19, 15, 11, 7, 2 dw 45, 41, 34, 23, 11, -2, -15, -27, -36, -43, -45, -44, -39, -30, -19, -7 dw 44, 34, 15, -7, -27, -41, -45, -39, -23, -2, 19, 36, 45, 43, 30, 11 dw 43, 23, -7, -34, -45, -36, -11, 19, 41, 44, 27, -2, -30, -45, -39, -15 dw 41, 11, -27, -45, -30, 7, 39, 43, 15, -23, -45, -34, 2, 36, 44, 19 dw 39, -2, -41, -36, 7, 43, 34, -11, -44, -30, 15, 45, 27, -19, -45, -23 dw 36, -15, -45, -11, 39, 34, -19, -45, -7, 41, 30, -23, -44, -2, 43, 27 dw 34, -27, -39, 19, 43, -11, -45, 2, 45, 7, -44, -15, 41, 23, -36, -30 dw 30, -36, -23, 41, 15, -44, -7, 45, -2, -45, 11, 43, -19, -39, 27, 34 dw 27, -43, -2, 44, -23, -30, 41, 7, -45, 19, 34, -39, -11, 45, -15, -36 dw 23, -45, 19, 27, -45, 15, 30, -44, 11, 34, -43, 7, 36, -41, 2, 39 dw 19, -44, 36, -2, -34, 45, -23, -15, 43, -39, 7, 30, -45, 27, 11, -41 dw 15, -39, 45, -30, 2, 27, -44, 41, -19, -11, 36, -45, 34, -7, -23, 43 dw 11, -30, 43, -45, 36, -19, -2, 23, -39, 45, -41, 27, -7, -15, 34, -44 dw 7, -19, 30, -39, 44, -45, 43, -36, 27, -15, 2, 11, -23, 34, -41, 45 dw 2, -7, 11, -15, 19, -23, 27, -30, 34, -36, 39, -41, 43, -44, 45, -45 tab_idct32_2: dw 32, 44, 42, 38, 32, 25, 17, 9 dw 32, 38, 17, -9, -32, -44, -42, -25 dw 32, 25, -17, -44, -32, 9, 42, 38 dw 32, 9, -42, -25, 32, 38, -17, -44 dw 32, -9, -42, 25, 32, -38, -17, 44 dw 32, -25, -17, 44, -32, -9, 42, -38 dw 32, -38, 17, 9, -32, 44, -42, 25 dw 32, -44, 42, -38, 32, -25, 17, -9 tab_idct32_3: dw 45, 43, 40, 35, 29, 21, 13, 4 dw 43, 29, 4, -21, -40, -45, -35, -13 dw 40, 4, -35, -43, -13, 29, 45, 21 dw 35, -21, -43, 4, 45, 13, -40, -29 dw 29, -40, -13, 45, -4, -43, 21, 35 dw 21, -45, 29, 13, -43, 35, 4, -40 dw 13, -35, 45, -40, 21, 4, -29, 43 dw 4, -13, 21, -29, 35, -40, 43, -45 tab_idct32_4: dw 32, 45, 44, 43, 42, 40, 38, 35, 32, 29, 25, 21, 17, 13, 9, 4 dw 32, 43, 38, 29, 17, 4, -9, -21, -32, -40, -44, -45, -42, -35, -25, -13 dw 32, 40, 25, 4, -17, -35, -44, -43, -32, -13, 9, 29, 42, 45, 38, 21 dw 32, 35, 9, -21, -42, -43, -25, 4, 32, 45, 38, 13, -17, -40, -44, -29 dw 32, 29, -9, -40, -42, -13, 25, 45, 32, -4, -38, -43, -17, 21, 44, 35 dw 32, 21, -25, -45, -17, 29, 44, 13, -32, -43, -9, 35, 42, 4, -38, -40 dw 32, 13, -38, -35, 17, 45, 9, -40, -32, 21, 44, 4, -42, -29, 25, 43 dw 32, 4, -44, -13, 42, 21, -38, -29, 32, 35, -25, -40, 17, 43, -9, -45 dw 32, -4, -44, 13, 42, -21, -38, 29, 32, -35, -25, 40, 17, -43, -9, 45 dw 32, -13, -38, 35, 17, -45, 9, 40, -32, -21, 44, -4, -42, 29, 25, -43 dw 32, -21, -25, 45, -17, -29, 44, -13, -32, 43, -9, -35, 42, -4, -38, 40 dw 32, -29, -9, 40, -42, 13, 25, -45, 32, 4, -38, 43, -17, -21, 44, -35 dw 32, -35, 9, 21, -42, 43, -25, -4, 32, -45, 38, -13, -17, 40, -44, 29 dw 32, -40, 25, -4, -17, 35, -44, 43, -32, 13, 9, -29, 42, -45, 38, -21 dw 32, -43, 38, -29, 17, -4, -9, 21, -32, 40, -44, 45, -42, 35, -25, 13 dw 32, -45, 44, -43, 42, -40, 
38, -35, 32, -29, 25, -21, 17, -13, 9, -4 ; ---------------------------------------------------------------------------- SECTION .text cextern pd_11 cextern pd_12 cextern pd_16 cextern pd_512 cextern pd_2048 ; ============================================================================ ; void idct_4x4(const coeff_t *src, coeff_t *dst, int i_dst) ; ============================================================================ ; ------------------------------------------------------------------ ; idct_4x4_sse2 INIT_XMM sse2 cglobal idct_4x4, 3, 4, 7 %define IDCT4_SHIFT1 5 ; shift1 = 5 %define IDCT4_OFFSET1 [pd_16] ; add1 = 16 %if BIT_DEPTH == 10 ; %define IDCT4_SHIFT2 10 ; %define IDCT4_OFFSET2 [pd_512] ; %elif BIT_DEPTH == 8 ; for BIT_DEPTH: 8 %define IDCT4_SHIFT2 12 ; shift2 = 12 %define IDCT4_OFFSET2 [pd_2048] ; add2 = 2048 %else ; %error Unsupported BIT_DEPTH! ; %endif ; add r2d, r2d ; r2 <-- i_dst lea r3, [tab_dct4] ; ; mova m6, IDCT4_OFFSET1 ; ; movu m0, [r0 + 0 * 16] ; mova??? movu m1, [r0 + 1 * 16] ; ; punpcklwd m2, m0, m1 ; pmaddwd m3, m2, [r3 + 0 * 16] ; m3 = E1 paddd m3, m6 ; ; pmaddwd m2, [r3 + 2 * 16] ; m2 = E2 paddd m2, m6 ; ; punpckhwd m0, m1 ; pmaddwd m1, m0, [r3 + 1 * 16] ; m1 = O1 pmaddwd m0, [r3 + 3 * 16] ; m0 = O2 ; paddd m4, m3, m1 ; psrad m4, IDCT4_SHIFT1 ; m4 = m128iA paddd m5, m2, m0 ; psrad m5, IDCT4_SHIFT1 ; packssdw m4, m5 ; m4 = m128iA ; psubd m2, m0 ; psrad m2, IDCT4_SHIFT1 ; psubd m3, m1 ; psrad m3, IDCT4_SHIFT1 ; packssdw m2, m3 ; m2 = m128iD ; punpcklwd m1, m4, m2 ; m1 = S0 punpckhwd m4, m2 ; m4 = S8 ; punpcklwd m0, m1, m4 ; m0 = m128iA punpckhwd m1, m4 ; m1 = m128iD ; mova m6, IDCT4_OFFSET2 ; ; punpcklwd m2, m0, m1 ; pmaddwd m3, m2, [r3 + 0 * 16] ; paddd m3, m6 ; m3 = E1 ; pmaddwd m2, [r3 + 2 * 16] ; paddd m2, m6 ; m2 = E2 ; punpckhwd m0, m1 ; pmaddwd m1, m0, [r3 + 1 * 16] ; m1 = O1 pmaddwd m0, [r3 + 3 * 16] ; m0 = O2 ; paddd m4, m3, m1 ; psrad m4, IDCT4_SHIFT2 ; m4 = m128iA paddd m5, m2, m0 ; psrad m5, IDCT4_SHIFT2 ; packssdw m4, m5 ; m4 = m128iA ; psubd m2, m0 ; psrad m2, IDCT4_SHIFT2 ; psubd m3, m1 ; psrad m3, IDCT4_SHIFT2 ; packssdw m2, m3 ; m2 = m128iD ; punpcklwd m1, m4, m2 ; punpckhwd m4, m2 ; ; punpcklwd m0, m1, m4 ; movlps [r1 + 0 * r2], m0 ; store dst, line 0 movhps [r1 + 1 * r2], m0 ; line 1 ; punpckhwd m1, m4 ; movlps [r1 + 2*r2], m1 ; store dst, line 2 lea r1, [r1 + 2*r2] ; movhps [r1 + r2], m1 ; line 3 ; RET ; %undef IDCT4_SHIFT1 %undef IDCT4_OFFSET1 %undef IDCT4_SHIFT2 %undef IDCT4_OFFSET2 ; ---------------------------------------------------------------------------- ; void idct_8x8(const coeff_t *src, coeff_t *dst, int i_dst) ; ---------------------------------------------------------------------------- INIT_XMM ssse3 cglobal patial_butterfly_inverse_internal_pass1 %define IDCT8_SHIFT1 5 ; shift1 = 5 %define IDCT8_ADD1 [pd_16] ; add1 = 16 ; movh m0, [r0 ] ; movhps m0, [r0 + 2 * 16] ; movh m1, [r0 + 4 * 16] ; movhps m1, [r0 + 6 * 16] ; ; punpckhwd m2, m0, m1 ; [2 6] punpcklwd m0, m1 ; [0 4] pmaddwd m1, m0, [r6 ] ; EE[0] pmaddwd m0, [r6 + 32] ; EE[1] pmaddwd m3, m2, [r6 + 16] ; EO[0] pmaddwd m2, [r6 + 48] ; EO[1] ; paddd m4, m1, m3 ; E[0] psubd m1, m3 ; E[3] paddd m3, m0, m2 ; E[1] psubd m0, m2 ; E[2] ; ; E[K] = E[k] + add ; mova m5, IDCT8_ADD1 ; add1 = 16 paddd m0, m5 ; paddd m1, m5 ; paddd m3, m5 ; paddd m4, m5 ; ; movh m2, [r0 + 16] ; movhps m2, [r0 + 5 * 16] ; movh m5, [r0 + 3 * 16] ; movhps m5, [r0 + 7 * 16] ; punpcklwd m6, m2, m5 ; [1 3] punpckhwd m2, m5 ; [5 7] ; pmaddwd m5, m6, [r4 ] ; pmaddwd m7, m2, [r4 + 16] ; paddd m5, m7 ; O[0] ; 
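; E[0]..E[3] (m4, m3, m0, m1 -- each already biased by add1) and O[0] (m5) are now
; ready; every add/sub pair that follows emits two output rows per butterfly:
;   row[k]   = (E[k] + O[k] + add1) >> IDCT8_SHIFT1
;   row[7-k] = (E[k] - O[k] + add1) >> IDCT8_SHIFT1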
paddd m7, m4, m5 ; psrad m7, IDCT8_SHIFT1 ; shift1 = 5 ; psubd m4, m5 ; psrad m4, IDCT8_SHIFT1 ; shift1 = 5 ; packssdw m7, m4 ; movh [r5 + 0 * 16], m7 ; movhps [r5 + 7 * 16], m7 ; ; pmaddwd m5, m6, [r4 + 32] ; pmaddwd m4, m2, [r4 + 48] ; paddd m5, m4 ; O[1] ; paddd m4, m3, m5 ; psrad m4, IDCT8_SHIFT1 ; shift1 = 5 ; psubd m3, m5 ; psrad m3, IDCT8_SHIFT1 ; shift1 = 5 ; packssdw m4, m3 ; movh [r5 + 1 * 16], m4 ; movhps [r5 + 6 * 16], m4 ; ; pmaddwd m5, m6, [r4 + 64] ; pmaddwd m4, m2, [r4 + 80] ; paddd m5, m4 ; O[2] ; paddd m4, m0, m5 ; psrad m4, IDCT8_SHIFT1 ; shift1 = 5 ; psubd m0, m5 ; psrad m0, IDCT8_SHIFT1 ; shift1 = 5 ; packssdw m4, m0 ; movh [r5 + 2 * 16], m4 ; movhps [r5 + 5 * 16], m4 ; ; pmaddwd m5, m6, [r4 + 96] ; pmaddwd m4, m2, [r4 + 112] ; paddd m5, m4 ; O[3] ; paddd m4, m1, m5 ; psrad m4, IDCT8_SHIFT1 ; shift1 = 5 ; psubd m1, m5 ; psrad m1, IDCT8_SHIFT1 ; shift1 = 5 ; packssdw m4, m1 ; movh [r5 + 3 * 16], m4 ; movhps [r5 + 4 * 16], m4 ; ; %undef IDCT8_SHIFT1 ; %undef IDCT8_ADD1 ; ret ; %macro PARTIAL_BUTTERFLY_PROCESS_ROW 1 %if BIT_DEPTH == 10 ; %define IDCT8_SHIFT2 10 ; %elif BIT_DEPTH == 8 ; for BIT_DEPTH: 8 %define IDCT8_SHIFT2 12 ; shift2 = 12 %else ; %error Unsupported BIT_DEPTH! ; %endif ; pshufb m4, %1, [pb_idct8even] ; pmaddwd m4, [tab_idct8_1] ; phsubd m5, m4 ; pshufd m4, m4, 0x4E ; phaddd m4, m4 ; punpckhqdq m4, m5 ; m4 = dd e[ 0 1 2 3] paddd m4, m6 ; ; pshufb %1, %1, [r6] ; pmaddwd m5, %1, [r4] ; pmaddwd %1, [r4 + 16] ; phaddd m5, %1 ; m5 = dd O[0, 1, 2, 3] ; paddd %1, m4, m5 ; psrad %1, IDCT8_SHIFT2 ; ; psubd m4, m5 ; psrad m4, IDCT8_SHIFT2 ; pshufd m4, m4, 0x1B ; ; packssdw %1, m4 ; %undef IDCT8_SHIFT2 ; %endmacro cglobal patial_butterfly_inverse_internal_pass2 mova m0, [r5 ] ; PARTIAL_BUTTERFLY_PROCESS_ROW m0 ; movu [r1 ], m0 ; ; mova m2, [r5 + 16] ; PARTIAL_BUTTERFLY_PROCESS_ROW m2 ; movu [r1 + r2], m2 ; ; mova m1, [r5 + 32] ; PARTIAL_BUTTERFLY_PROCESS_ROW m1 ; movu [r1 + 2*r2], m1 ; ; mova m3, [r5 + 48] ; PARTIAL_BUTTERFLY_PROCESS_ROW m3 ; movu [r1 + r3], m3 ; ; ret ; ; ------------------------------------------------------------------ ; idct_8x8_ssse3 cglobal idct_8x8, 3,7,8 ;,0-16*mmsize ; alignment stack to 64-bytes ; mov r5, rsp ; sub rsp, 16*mmsize + gprsize ; and rsp, ~(64-1) ; mov [rsp + 16*mmsize], r5 ; mov r5, rsp ; ; lea r4, [tab_idct8_3] ; lea r6, [tab_dct4] ; ; call patial_butterfly_inverse_internal_pass1 ; add r0, 8 ; add r5, 8 ; ; call patial_butterfly_inverse_internal_pass1 ; %if BIT_DEPTH == 10 ; mova m6, [pd_512] ; %elif BIT_DEPTH == 8 ; mova m6, [pd_2048] ; %else ; %error Unsupported BIT_DEPTH! 
; %endif ; add r2, r2 ; lea r3, [r2 * 3] ; lea r4, [tab_idct8_2] ; lea r6, [pb_idct8odd] ; sub r5, 8 ; ; call patial_butterfly_inverse_internal_pass2 ; lea r1, [r1 + 4 * r2] ; add r5, 64 ; ; call patial_butterfly_inverse_internal_pass2 ; ; restore origin stack pointer ; mov rsp, [rsp + 16*mmsize] ; RET ; ; ============================================================================ ; ARCH_X86_64 ONLY ; ============================================================================ %if ARCH_X86_64 == 1 ; ---------------------------------------------------------------------------- ; void idct_4x4(const coeff_t *src, coeff_t *dst, int i_dst) ; ---------------------------------------------------------------------------- INIT_YMM avx2 cglobal idct_4x4, 3, 4, 6 %define IDCT4_SHIFT1 5 ; shift1 = 5 vbroadcasti128 m4, [pd_16] ; add1 = 16 %if BIT_DEPTH == 10 ; %define IDCT4_SHIFT2 10 ; vpbroadcastd m5, [pd_512] ; %elif BIT_DEPTH == 8 ; for BIT_DEPTH: 8 %define IDCT4_SHIFT2 12 ; shift2 = 12 vpbroadcastd m5, [pd_2048] ; add2 = 2048 %else ; %error Unsupported BIT_DEPTH! ; %endif ; ; add r2, r2 ; r2 <-- i_src (src is 16bit data) lea r3, [r2 * 3] ; r3 <-- 3 * i_src ; movu m0, [r0] ; [00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33] ; pshufb m0, [idct4_shuf1] ; [00 02 01 03 10 12 11 13 20 22 21 23 30 32 31 33] vextracti128 xm1, m0, 1 ; [20 22 21 23 30 32 31 33] punpcklwd xm2, xm0, xm1 ; [00 20 02 22 01 21 03 23] punpckhwd xm0, xm1 ; [10 30 12 32 11 31 13 33] vinserti128 m2, m2, xm2, 1 ; [00 20 02 22 01 21 03 23 00 20 02 22 01 21 03 23] vinserti128 m0, m0, xm0, 1 ; [10 30 12 32 11 31 13 33 10 30 12 32 11 31 13 33] ; mova m1, [avx2_idct4_1 ] ; mova m3, [avx2_idct4_1 + 32] ; pmaddwd m1, m2 ; pmaddwd m3, m0 ; ; paddd m0, m1, m3 ; paddd m0, m4 ; psrad m0, IDCT4_SHIFT1 ; [00 20 10 30 01 21 11 31] ; psubd m1, m3 ; paddd m1, m4 ; psrad m1, IDCT4_SHIFT1 ; [03 23 13 33 02 22 12 32] ; packssdw m0, m1 ; [00 20 10 30 03 23 13 33 01 21 11 31 02 22 12 32] vmovshdup m1, m0 ; [10 30 10 30 13 33 13 33 11 31 11 31 12 32 12 32] vmovsldup m0, m0 ; [00 20 00 20 03 23 03 23 01 21 01 21 02 22 02 22] ; vpbroadcastq m2, [avx2_idct4_2 ] ; vpbroadcastq m3, [avx2_idct4_2 + 8] ; pmaddwd m0, m2 ; pmaddwd m1, m3 ; ; paddd m2, m0, m1 ; paddd m2, m5 ; psrad m2, IDCT4_SHIFT2 ; [00 01 10 11 30 31 20 21] ; psubd m0, m1 ; paddd m0, m5 ; psrad m0, IDCT4_SHIFT2 ; [03 02 13 12 33 32 23 22] ; pshufb m0, [idct4_shuf2] ; [02 03 12 13 32 33 22 23] punpcklqdq m1, m2, m0 ; [00 01 02 03 10 11 12 13] punpckhqdq m2, m0 ; [30 31 32 33 20 21 22 23] packssdw m1, m2 ; [00 01 02 03 30 31 32 33 10 11 12 13 20 21 22 23] vextracti128 xm0, m1, 1 ; ; movq [r1 ], xm1 ; store result, line 0 movq [r1 + r2], xm0 ; store result, line 1 movhps [r1 + 2*r2], xm0 ; store result, line 2 movhps [r1 + r3], xm1 ; store result, line 3 RET ; %undef IDCT4_SHIFT1 %undef IDCT4_SHIFT2 %macro IDCT8_PASS_1 1 vpbroadcastd m7, [r5 + %1 ] ; vpbroadcastd m10, [r5 + %1 + 4] ; pmaddwd m5, m4, m7 ; pmaddwd m6, m0, m10 ; paddd m5, m6 ; ; vpbroadcastd m7, [r6 + %1 ] ; vpbroadcastd m10, [r6 + %1 + 4] ; pmaddwd m6, m1, m7 ; pmaddwd m3, m2, m10 ; paddd m6, m3 ; ; paddd m3, m5, m6 ; paddd m3, m11 ; psrad m3, IDCT8_SHIFT1 ; ; psubd m5, m6 ; paddd m5, m11 ; psrad m5, IDCT8_SHIFT1 ; ; vpbroadcastd m7, [r5 + %1 + 32] ; vpbroadcastd m10, [r5 + %1 + 36] ; pmaddwd m6, m4, m7 ; pmaddwd m8, m0, m10 ; paddd m6, m8 ; ; vpbroadcastd m7, [r6 + %1 + 32] ; vpbroadcastd m10, [r6 + %1 + 36] ; pmaddwd m8, m1, m7 ; pmaddwd m9, m2, m10 ; paddd m8, m9 ; ; paddd m9, m6, m8 ; paddd m9, m11 ; psrad m9, IDCT8_SHIFT1 ; ; 
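; m9 above is (even + odd + add1) >> IDCT8_SHIFT1 for this half of the pass; the
; psubd/paddd/psrad that follow form the mirrored (even - odd + add1) rows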
psubd m6, m8 ; paddd m6, m11 ; psrad m6, IDCT8_SHIFT1 ; ; packssdw m3, m9 ; vpermq m3, m3, 0xD8 ; ; packssdw m6, m5 ; vpermq m6, m6, 0xD8 ; %endmacro %macro IDCT8_PASS_2 0 punpcklqdq m2, m0, m1 ; punpckhqdq m0, m1 ; ; pmaddwd m3, m2, [r5 ] ; pmaddwd m5, m2, [r5 + 32] ; pmaddwd m6, m2, [r5 + 64] ; pmaddwd m7, m2, [r5 + 96] ; phaddd m3, m5 ; phaddd m6, m7 ; pshufb m3, [idct8_shuf2] ; pshufb m6, [idct8_shuf2] ; punpcklqdq m7, m3, m6 ; punpckhqdq m3, m6 ; ; pmaddwd m5, m0, [r6 ] ; pmaddwd m6, m0, [r6 + 32] ; pmaddwd m8, m0, [r6 + 64] ; pmaddwd m9, m0, [r6 + 96] ; phaddd m5, m6 ; phaddd m8, m9 ; pshufb m5, [idct8_shuf2] ; pshufb m8, [idct8_shuf2] ; punpcklqdq m6, m5, m8 ; punpckhqdq m5, m8 ; ; paddd m8, m7, m6 ; paddd m8, m12 ; psrad m8, IDCT8_SHIFT2 ; ; psubd m7, m6 ; paddd m7, m12 ; psrad m7, IDCT8_SHIFT2 ; ; pshufb m7, [idct8_shuf3] ; packssdw m8, m7 ; ; paddd m9, m3, m5 ; paddd m9, m12 ; psrad m9, IDCT8_SHIFT2 ; ; psubd m3, m5 ; paddd m3, m12 ; psrad m3, IDCT8_SHIFT2 ; ; pshufb m3, [idct8_shuf3] ; packssdw m9, m3 ; %endmacro ; ---------------------------------------------------------------------------- ; void idct_8x8(const coeff_t *src, coeff_t *dst, int i_dst) ; ---------------------------------------------------------------------------- ; ------------------------------------------------------------------ ; idct_8x8_sse2 INIT_XMM sse2 %define IDCT8_SHIFT1 5 ; shift1 = 5 %define IDCT8_ADD1 [pd_16] ; add1 = 16 %if BIT_DEPTH == 10 ; %define IDCT8_SHIFT2 10 ; %define IDCT8_ADD2 [pd_512] ; %elif BIT_DEPTH == 8 ; for BIT_DEPTH: 8 %define IDCT8_SHIFT2 12 ; shift2 = 12 %define IDCT8_ADD2 [pd_2048] ; add2 = 2048 %else ; %error Unsupported BIT_DEPTH! ; %endif ; cglobal idct_8x8, 3, 6, 16, 0-5*mmsize mova m9, [r0 + 1*mmsize] ; mova m1, [r0 + 3*mmsize] ; mova m7, m9 ; punpcklwd m7, m1 ; punpckhwd m9, m1 ; mova m14, [tab_idct8_3] ; mova m3, m14 ; pmaddwd m14, m7 ; pmaddwd m3, m9 ; mova m0, [r0 + 5*mmsize] ; mova m10, [r0 + 7*mmsize] ; mova m2, m0 ; punpcklwd m2, m10 ; punpckhwd m0, m10 ; mova m15, [tab_idct8_3+1*mmsize] ; mova m11, [tab_idct8_3+1*mmsize] ; pmaddwd m15, m2 ; mova m4, [tab_idct8_3+2*mmsize] ; pmaddwd m11, m0 ; mova m1, [tab_idct8_3+2*mmsize] ; paddd m15, m14 ; mova m5, [tab_idct8_3+4*mmsize] ; mova m12, [tab_idct8_3+4*mmsize] ; paddd m11, m3 ; mova [rsp + 0*mmsize], m11 ; mova [rsp + 1*mmsize], m15 ; pmaddwd m4, m7 ; pmaddwd m1, m9 ; mova m14, [tab_idct8_3+3*mmsize] ; mova m3, [tab_idct8_3+3*mmsize] ; pmaddwd m14, m2 ; pmaddwd m3, m0 ; paddd m14, m4 ; paddd m3, m1 ; mova [rsp + 2*mmsize], m3 ; pmaddwd m5, m9 ; pmaddwd m9, [tab_idct8_3+6*mmsize] ; mova m6, [tab_idct8_3+5*mmsize] ; pmaddwd m12, m7 ; pmaddwd m7, [tab_idct8_3+6*mmsize] ; mova m4, [tab_idct8_3+5*mmsize] ; pmaddwd m6, m2 ; paddd m6, m12 ; pmaddwd m2, [tab_idct8_3+7*mmsize] ; paddd m7, m2 ; mova [rsp + 3*mmsize], m6 ; pmaddwd m4, m0 ; pmaddwd m0, [tab_idct8_3+7*mmsize] ; paddd m9, m0 ; paddd m5, m4 ; mova m6, [r0 + 0*mmsize] ; mova m0, [r0 + 4*mmsize] ; mova m4, m6 ; punpcklwd m4, m0 ; punpckhwd m6, m0 ; mova m12, [r0 + 2*mmsize] ; mova m0, [r0 + 6*mmsize] ; mova m13, m12 ; mova m8, [tab_dct4] ; punpcklwd m13, m0 ; mova m10, [tab_dct4] ; punpckhwd m12, m0 ; pmaddwd m8, m4 ; mova m3, m8 ; pmaddwd m4, [tab_dct4 + 2*mmsize] ; pmaddwd m10, m6 ; mova m2, [tab_dct4 + 1*mmsize] ; mova m1, m10 ; pmaddwd m6, [tab_dct4 + 2*mmsize] ; mova m0, [tab_dct4 + 1*mmsize] ; pmaddwd m2, m13 ; paddd m3, m2 ; psubd m8, m2 ; mova m2, m6 ; pmaddwd m13, [tab_dct4 + 3*mmsize] ; pmaddwd m0, m12 ; paddd m1, m0 ; psubd m10, m0 ; mova m0, m4 ; pmaddwd m12, 
[tab_dct4 + 3*mmsize] ; paddd m3, IDCT8_ADD1 ; add1 = 16 paddd m1, IDCT8_ADD1 ; add1 = 16 paddd m8, IDCT8_ADD1 ; add1 = 16 paddd m10, IDCT8_ADD1 ; add1 = 16 paddd m0, m13 ; paddd m2, m12 ; paddd m0, IDCT8_ADD1 ; add1 = 16 paddd m2, IDCT8_ADD1 ; add1 = 16 psubd m4, m13 ; psubd m6, m12 ; paddd m4, IDCT8_ADD1 ; add1 = 16 paddd m6, IDCT8_ADD1 ; add1 = 16 mova m12, m8 ; psubd m8, m7 ; psrad m8, IDCT8_SHIFT1 ; shift1 = 5 paddd m15, m3 ; psubd m3, [rsp + 1*mmsize] ; psrad m15, IDCT8_SHIFT1 ; shift1 = 5 paddd m12, m7 ; psrad m12, IDCT8_SHIFT1 ; shift1 = 5 paddd m11, m1 ; mova m13, m14 ; psrad m11, IDCT8_SHIFT1 ; shift1 = 5 packssdw m15, m11 ; psubd m1, [rsp + 0*mmsize] ; psrad m1, IDCT8_SHIFT1 ; shift1 = 5 mova m11, [rsp + 2*mmsize] ; paddd m14, m0 ; psrad m14, IDCT8_SHIFT1 ; shift1 = 5 psubd m0, m13 ; psrad m0, IDCT8_SHIFT1 ; shift1 = 5 paddd m11, m2 ; mova m13, [rsp + 3*mmsize] ; psrad m11, IDCT8_SHIFT1 ; shift1 = 5 packssdw m14, m11 ; mova m11, m6 ; psubd m6, m5 ; paddd m13, m4 ; psrad m13, IDCT8_SHIFT1 ; shift1 = 5 psrad m6, IDCT8_SHIFT1 ; shift1 = 5 paddd m11, m5 ; psrad m11, IDCT8_SHIFT1 ; shift1 = 5 packssdw m13, m11 ; mova m11, m10 ; psubd m4, [rsp + 3*mmsize] ; psubd m10, m9 ; psrad m4, IDCT8_SHIFT1 ; shift1 = 5 psrad m10, IDCT8_SHIFT1 ; shift1 = 5 packssdw m4, m6 ; packssdw m8, m10 ; paddd m11, m9 ; psrad m11, IDCT8_SHIFT1 ; shift1 = 5 packssdw m12, m11 ; psubd m2, [rsp + 2*mmsize] ; mova m5, m15 ; psrad m2, IDCT8_SHIFT1 ; shift1 = 5 packssdw m0, m2 ; mova m2, m14 ; psrad m3, IDCT8_SHIFT1 ; shift1 = 5 packssdw m3, m1 ; mova m6, m13 ; punpcklwd m5, m8 ; punpcklwd m2, m4 ; mova m1, m12 ; punpcklwd m6, m0 ; punpcklwd m1, m3 ; mova m9, m5 ; punpckhwd m13, m0 ; mova m0, m2 ; punpcklwd m9, m6 ; punpckhwd m5, m6 ; punpcklwd m0, m1 ; punpckhwd m2, m1 ; punpckhwd m15, m8 ; mova m1, m5 ; punpckhwd m14, m4 ; punpckhwd m12, m3 ; mova m6, m9 ; punpckhwd m9, m0 ; punpcklwd m1, m2 ; mova m4, [tab_idct8_3+0*mmsize] ; punpckhwd m5, m2 ; punpcklwd m6, m0 ; mova m2, m15 ; mova m0, m14 ; mova m7, m9 ; punpcklwd m2, m13 ; punpcklwd m0, m12 ; punpcklwd m7, m5 ; punpckhwd m14, m12 ; mova m10, m2 ; punpckhwd m15, m13 ; punpckhwd m9, m5 ; pmaddwd m4, m7 ; mova m13, m1 ; punpckhwd m2, m0 ; punpcklwd m10, m0 ; mova m0, m15 ; punpckhwd m15, m14 ; mova m12, m1 ; mova m3, [tab_idct8_3+0*mmsize] ; punpcklwd m0, m14 ; pmaddwd m3, m9 ; mova m11, m2 ; punpckhwd m2, m15 ; punpcklwd m11, m15 ; mova m8, [tab_idct8_3+1*mmsize] ; punpcklwd m13, m0 ; punpckhwd m12, m0 ; pmaddwd m8, m11 ; paddd m8, m4 ; mova [rsp + 4*mmsize], m8 ; mova m4, [tab_idct8_3+2*mmsize] ; pmaddwd m4, m7 ; mova m15, [tab_idct8_3+2*mmsize] ; mova m5, [tab_idct8_3+1*mmsize] ; pmaddwd m15, m9 ; pmaddwd m5, m2 ; paddd m5, m3 ; mova [rsp + 3*mmsize], m5 ; mova m14, [tab_idct8_3+3*mmsize] ; mova m5, [tab_idct8_3+3*mmsize] ; pmaddwd m14, m11 ; paddd m14, m4 ; mova [rsp + 2*mmsize], m14 ; pmaddwd m5, m2 ; paddd m5, m15 ; mova [rsp + 1*mmsize], m5 ; mova m15, [tab_idct8_3+4*mmsize] ; mova m5, [tab_idct8_3+4*mmsize] ; pmaddwd m15, m7 ; pmaddwd m7, [tab_idct8_3+6*mmsize] ; pmaddwd m5, m9 ; pmaddwd m9, [tab_idct8_3+6*mmsize] ; mova m4, [tab_idct8_3+5*mmsize] ; pmaddwd m4, m2 ; paddd m5, m4 ; mova m4, m6 ; mova m8, [tab_idct8_3+5*mmsize] ; punpckhwd m6, m10 ; pmaddwd m2, [tab_idct8_3+7*mmsize] ; punpcklwd m4, m10 ; paddd m9, m2 ; pmaddwd m8, m11 ; mova m10, [tab_dct4] ; paddd m8, m15 ; pmaddwd m11, [tab_idct8_3+7*mmsize] ; paddd m7, m11 ; mova [rsp + 0*mmsize], m8 ; pmaddwd m10, m6 ; pmaddwd m6, [tab_dct4 + 2*mmsize] ; mova m1, m10 ; mova m8, [tab_dct4] ; mova m3, 
[tab_dct4 + 1*mmsize] ; pmaddwd m8, m4 ; pmaddwd m4, [tab_dct4 + 2*mmsize] ; mova m0, m8 ; mova m2, [tab_dct4 + 1*mmsize] ; pmaddwd m3, m13 ; psubd m8, m3 ; paddd m0, m3 ; mova m3, m6 ; pmaddwd m13, [tab_dct4 + 3*mmsize] ; pmaddwd m2, m12 ; paddd m1, m2 ; psubd m10, m2 ; mova m2, m4 ; pmaddwd m12, [tab_dct4 + 3*mmsize] ; paddd m0, IDCT8_ADD2 ; add2 = 2048 paddd m1, IDCT8_ADD2 ; add2 = 2048 paddd m8, IDCT8_ADD2 ; add2 = 2048 paddd m10, IDCT8_ADD2 ; add2 = 2048 paddd m2, m13 ; paddd m3, m12 ; paddd m2, IDCT8_ADD2 ; add2 = 2048 paddd m3, IDCT8_ADD2 ; add2 = 2048 psubd m4, m13 ; psubd m6, m12 ; paddd m4, IDCT8_ADD2 ; add2 = 2048 paddd m6, IDCT8_ADD2 ; add2 = 2048 mova m15, [rsp + 4*mmsize] ; mova m12, m8 ; psubd m8, m7 ; psrad m8, IDCT8_SHIFT2 ; shift2 = 12 mova m11, [rsp + 3*mmsize] ; paddd m15, m0 ; psrad m15, IDCT8_SHIFT2 ; shift2 = 12 psubd m0, [rsp + 4*mmsize] ; psrad m0, IDCT8_SHIFT2 ; shift2 = 12 paddd m12, m7 ; paddd m11, m1 ; mova m14, [rsp + 2*mmsize] ; psrad m11, IDCT8_SHIFT2 ; shift2 = 12 packssdw m15, m11 ; psubd m1, [rsp + 3*mmsize] ; psrad m1, IDCT8_SHIFT2 ; shift2 = 12 mova m11, [rsp + 1*mmsize] ; paddd m14, m2 ; psrad m14, IDCT8_SHIFT2 ; shift2 = 12 packssdw m0, m1 ; psrad m12, IDCT8_SHIFT2 ; shift2 = 12 psubd m2, [rsp + 2*mmsize] ; paddd m11, m3 ; mova m13, [rsp + 0*mmsize] ; psrad m11, IDCT8_SHIFT2 ; shift2 = 12 packssdw m14, m11 ; mova m11, m6 ; psubd m6, m5 ; paddd m13, m4 ; psrad m13, IDCT8_SHIFT2 ; shift2 = 12 mova m1, m15 ; paddd m11, m5 ; psrad m11, IDCT8_SHIFT2 ; shift2 = 12 packssdw m13, m11 ; mova m11, m10 ; psubd m10, m9 ; psrad m10, IDCT8_SHIFT2 ; shift2 = 12 packssdw m8, m10 ; psrad m6, IDCT8_SHIFT2 ; shift2 = 12 psubd m4, [rsp + 0*mmsize] ; paddd m11, m9 ; psrad m11, IDCT8_SHIFT2 ; shift2 = 12 packssdw m12, m11 ; punpcklwd m1, m14 ; mova m5, m13 ; psrad m4, IDCT8_SHIFT2 ; shift2 = 12 packssdw m4, m6 ; psubd m3, [rsp + 1*mmsize] ; psrad m2, IDCT8_SHIFT2 ; shift2 = 12 mova m6, m8 ; psrad m3, IDCT8_SHIFT2 ; shift2 = 12 punpcklwd m5, m12 ; packssdw m2, m3 ; punpcklwd m6, m4 ; punpckhwd m8, m4 ; mova m4, m1 ; mova m3, m2 ; punpckhdq m1, m5 ; punpckldq m4, m5 ; punpcklwd m3, m0 ; punpckhwd m2, m0 ; mova m0, m6 ; lea r2, [r2 + r2] ; lea r4, [r2 + r2] ; lea r3, [r4 + r2] ; lea r4, [r4 + r3] ; lea r0, [r4 + 2*r2] ; movq [r1], m4 ; punpckhwd m15, m14 ; movhps [r1 + r2], m4 ; punpckhdq m0, m3 ; movq [r1 + 2*r2], m1 ; punpckhwd m13, m12 ; movhps [r1 + r3], m1 ; mova m1, m6 ; punpckldq m1, m3 ; movq [r1 + 8], m1 ; movhps [r1 + r2 + 8], m1 ; movq [r1 + 2*r2 + 8], m0 ; movhps [r1 + r3 + 8], m0 ; mova m0, m15 ; punpckhdq m15, m13 ; punpckldq m0, m13 ; movq [r1 + 4*r2], m0 ; movhps [r1 + r4], m0 ; mova m0, m8 ; punpckhdq m8, m2 ; movq [r1 + 2*r3], m15 ; punpckldq m0, m2 ; movhps [r1 + r0 ], m15 ; movq [r1 + 4*r2 + 8], m0 ; movhps [r1 + r4 + 8], m0 ; movq [r1 + 2*r3 + 8], m8 ; movhps [r1 + r0 + 8], m8 ; RET ; %undef IDCT8_SHIFT1 %undef IDCT8_SHIFT2 %undef IDCT8_ADD1 %undef IDCT8_ADD2 ; ---------------------------------------------------------------------------- ; void idct_8x8(const coeff_t *src, coeff_t *dst, int i_dst) ; ---------------------------------------------------------------------------- ; ------------------------------------------------------------------ ; idct_8x8_avx2 INIT_YMM avx2 cglobal idct_8x8, 3, 7, 13, 0-8*16 %define IDCT8_SHIFT1 5 ; shift1 = 5 %define IDCT8_ADD1 [pd_16] ; add1 = 16 %if BIT_DEPTH == 10 ; %define IDCT8_SHIFT2 10 ; vpbroadcastd m12, [pd_512] ; %elif BIT_DEPTH == 8 ; for BIT_DEPTH: 8 %define IDCT8_SHIFT2 12 ; shift2 = 12 vpbroadcastd m12, 
[pd_2048] ; add1 = 2048 %else ; %error Unsupported BIT_DEPTH! ; %endif ; ; vbroadcasti128 m11, IDCT8_ADD1 ; add1 = 16 ; mov r4, rsp ; lea r5, [avx2_idct8_1] ; lea r6, [avx2_idct8_2] ; ; ;pass1 ; movu m1, [r0 + 0 * 32] ; [0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1] movu m0, [r0 + 1 * 32] ; [2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3] vpunpcklwd m5, m1, m0 ; [0 2 0 2 0 2 0 2 1 3 1 3 1 3 1 3] vpunpckhwd m1, m0 ; [0 2 0 2 0 2 0 2 1 3 1 3 1 3 1 3] vinserti128 m4, m5, xm1, 1 ; [0 2 0 2 0 2 0 2 0 2 0 2 0 2 0 2] vextracti128 xm2, m5, 1 ; [1 3 1 3 1 3 1 3] vinserti128 m1, m1, xm2, 0 ; [1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3] ; movu m2, [r0 + 2 * 32] ; [4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5] movu m0, [r0 + 3 * 32] ; [6 6 6 6 6 6 6 6 7 7 7 7 7 7 7 7] vpunpcklwd m5, m2, m0 ; [4 6 4 6 4 6 4 6 5 7 5 7 5 7 5 7] vpunpckhwd m2, m0 ; [4 6 4 6 4 6 4 6 5 7 5 7 5 7 5 7] vinserti128 m0, m5, xm2, 1 ; [4 6 4 6 4 6 4 6 4 6 4 6 4 6 4 6] vextracti128 xm5, m5, 1 ; [5 7 5 7 5 7 5 7] vinserti128 m2, m2, xm5, 0 ; [5 7 5 7 5 7 5 7 5 7 5 7 5 7 5 7] ; mova m5, [idct8_shuf1] ; vpermd m4, m5, m4 ; vpermd m0, m5, m0 ; vpermd m1, m5, m1 ; vpermd m2, m5, m2 ; ; IDCT8_PASS_1 0 ; mova [r4 ], m3 ; mova [r4 + 96], m6 ; ; IDCT8_PASS_1 64 ; mova [r4 + 32], m3 ; mova [r4 + 64], m6 ; ; ;pass2 ; add r2d, r2d ; lea r3, [r2 * 3] ; ; mova m0, [r4 ] ; mova m1, [r4 + 32] ; IDCT8_PASS_2 ; ; vextracti128 xm3, m8, 1 ; movu [r1 ], xm8 ; movu [r1 + r2], xm3 ; vextracti128 xm3, m9, 1 ; movu [r1 + 2*r2], xm9 ; movu [r1 + r3], xm3 ; ; lea r1, [r1 + r2 * 4] ; mova m0, [r4 + 64] ; mova m1, [r4 + 96] ; IDCT8_PASS_2 ; ; vextracti128 xm3, m8, 1 ; movu [r1 ], xm8 ; movu [r1 + r2], xm3 ; vextracti128 xm3, m9, 1 ; movu [r1 + 2*r2], xm9 ; movu [r1 + r3], xm3 ; RET ; %undef IDCT8_SHIFT1 %undef IDCT8_SHIFT2 %undef IDCT8_ADD1 %undef IDCT8_ADD2 %macro IDCT16_PASS1 2 vbroadcasti128 m5, [tab_idct16_2 + %1 * 16] pmaddwd m9, m0, m5 ; pmaddwd m10, m7, m5 ; phaddd m9, m10 ; ; pmaddwd m10, m6, m5 ; pmaddwd m11, m8, m5 ; phaddd m10, m11 ; ; phaddd m9, m10 ; vbroadcasti128 m5, [tab_idct16_1 + %1*16] ; ; pmaddwd m10, m1, m5 ; pmaddwd m11, m3, m5 ; phaddd m10, m11 ; ; pmaddwd m11, m4, m5 ; pmaddwd m12, m2, m5 ; phaddd m11, m12 ; ; phaddd m10, m11 ; ; paddd m11, m9, m10 ; paddd m11, m14 ; psrad m11, IDCT16_SHIFT1 ; ; psubd m9, m10 ; paddd m9, m14 ; psrad m9, IDCT16_SHIFT1 ; ; vbroadcasti128 m5, [tab_idct16_2 + %1*16 + 16] ; pmaddwd m10, m0, m5 ; pmaddwd m12, m7, m5 ; phaddd m10, m12 ; ; pmaddwd m12, m6, m5 ; pmaddwd m13, m8, m5 ; phaddd m12, m13 ; ; phaddd m10, m12 ; vbroadcasti128 m5, [tab_idct16_1 + %1 * 16 + 16] ; pmaddwd m12, m1, m5 ; pmaddwd m13, m3, m5 ; phaddd m12, m13 ; ; pmaddwd m13, m4, m5 ; pmaddwd m5, m2 ; phaddd m13, m5 ; ; phaddd m12, m13 ; ; paddd m5, m10, m12 ; paddd m5, m14 ; psrad m5, IDCT16_SHIFT1 ; ; psubd m10, m12 ; paddd m10, m14 ; psrad m10, IDCT16_SHIFT1 ; ; packssdw m11, m5 ; packssdw m9, m10 ; ; mova m10, [idct16_shuff] ; mova m5, [idct16_shuff1] ; ; vpermd m12, m10, m11 ; vpermd m13, m5, m9 ; mova [r3 + %1*16*2 ], xm12 ; mova [r3 + %2*16*2 ], xm13 ; vextracti128 [r3 + %2*16*2 + 32], m13, 1 ; vextracti128 [r3 + %1*16*2 + 32], m12, 1 ; %endmacro ; ---------------------------------------------------------------------------- ; void idct_16x16(const coeff_t *src, coeff_t *dst, int i_dst) ; ---------------------------------------------------------------------------- ; ------------------------------------------------------------------ ; idct_16x16_avx2 INIT_YMM avx2 cglobal idct_16x16, 3, 7, 16, 0-16*mmsize %define IDCT16_SHIFT1 5 ; shift1 = 5 %define IDCT16_ADD1 [pd_16] ; add1 = 16 %if 
BIT_DEPTH == 10 ; %define IDCT16_SHIFT2 10 ; vpbroadcastd m15, [pd_512] ; %elif BIT_DEPTH == 8 ; for BIT_DEPTH: 8 %define IDCT16_SHIFT2 12 ; shift2 = 12 vpbroadcastd m15, [pd_2048] ; add2 = 2048 %else ; %error Unsupported BIT_DEPTH! ; %endif ; ; vbroadcasti128 m14, IDCT16_ADD1 ; add1 = 16 ; add r2d, r2d ; mov r3, rsp ; mov r4d, 2 ; ; .pass1: ; movu xm0, [r0 + 0 * 32] ; movu xm1, [r0 + 8 * 32] ; punpckhqdq xm2, xm0, xm1 ; punpcklqdq xm0, xm1 ; vinserti128 m0, m0, xm2, 1 ; ; movu xm1, [r0 + 1 * 32] ; movu xm2, [r0 + 9 * 32] ; punpckhqdq xm3, xm1, xm2 ; punpcklqdq xm1, xm2 ; vinserti128 m1, m1, xm3, 1 ; ; movu xm2, [r0 + 2 * 32] ; movu xm3, [r0 + 10 * 32] ; punpckhqdq xm4, xm2, xm3 ; punpcklqdq xm2, xm3 ; vinserti128 m2, m2, xm4, 1 ; ; movu xm3, [r0 + 3 * 32] ; movu xm4, [r0 + 11 * 32] ; punpckhqdq xm5, xm3, xm4 ; punpcklqdq xm3, xm4 ; vinserti128 m3, m3, xm5, 1 ; ; movu xm4, [r0 + 4 * 32] ; movu xm5, [r0 + 12 * 32] ; punpckhqdq xm6, xm4, xm5 ; punpcklqdq xm4, xm5 ; vinserti128 m4, m4, xm6, 1 ; ; movu xm5, [r0 + 5 * 32] ; movu xm6, [r0 + 13 * 32] ; punpckhqdq xm7, xm5, xm6 ; punpcklqdq xm5, xm6 ; vinserti128 m5, m5, xm7, 1 ; ; movu xm6, [r0 + 6 * 32] ; movu xm7, [r0 + 14 * 32] ; punpckhqdq xm8, xm6, xm7 ; punpcklqdq xm6, xm7 ; vinserti128 m6, m6, xm8, 1 ; ; movu xm7, [r0 + 7 * 32] ; movu xm8, [r0 + 15 * 32] ; punpckhqdq xm9, xm7, xm8 ; punpcklqdq xm7, xm8 ; vinserti128 m7, m7, xm9, 1 ; ; punpckhwd m8, m0, m2 ; [8 10] punpcklwd m0, m2 ; [0 2] ; punpckhwd m2, m1, m3 ; [9 11] punpcklwd m1, m3 ; [1 3] ; punpckhwd m3, m4, m6 ; [12 14] punpcklwd m4, m6 ; [4 6] ; punpckhwd m6, m5, m7 ; [13 15] punpcklwd m5, m7 ; [5 7] ; punpckhdq m7, m0, m4 ; [02 22 42 62 03 23 43 63 06 26 46 66 07 27 47 67] punpckldq m0, m4 ; [00 20 40 60 01 21 41 61 04 24 44 64 05 25 45 65] ; punpckhdq m4, m8, m3 ; [82 102 122 142 83 103 123 143 86 106 126 146 87 107 127 147] punpckldq m8, m3 ; [80 100 120 140 81 101 121 141 84 104 124 144 85 105 125 145] ; punpckhdq m3, m1, m5 ; [12 32 52 72 13 33 53 73 16 36 56 76 17 37 57 77] punpckldq m1, m5 ; [10 30 50 70 11 31 51 71 14 34 54 74 15 35 55 75] ; punpckhdq m5, m2, m6 ; [92 112 132 152 93 113 133 153 96 116 136 156 97 117 137 157] punpckldq m2, m6 ; [90 110 130 150 91 111 131 151 94 114 134 154 95 115 135 155] ; punpckhqdq m6, m0, m8 ; [01 21 41 61 81 101 121 141 05 25 45 65 85 105 125 145] punpcklqdq m0, m8 ; [00 20 40 60 80 100 120 140 04 24 44 64 84 104 124 144] ; punpckhqdq m8, m7, m4 ; [03 23 43 63 43 103 123 143 07 27 47 67 87 107 127 147] punpcklqdq m7, m4 ; [02 22 42 62 82 102 122 142 06 26 46 66 86 106 126 146] ; punpckhqdq m4, m1, m2 ; [11 31 51 71 91 111 131 151 15 35 55 75 95 115 135 155] punpcklqdq m1, m2 ; [10 30 50 70 90 110 130 150 14 34 54 74 94 114 134 154] ; punpckhqdq m2, m3, m5 ; [13 33 53 73 93 113 133 153 17 37 57 77 97 117 137 157] punpcklqdq m3, m5 ; [12 32 52 72 92 112 132 152 16 36 56 76 96 116 136 156] ; IDCT16_PASS1 0, 14 ; IDCT16_PASS1 2, 12 ; IDCT16_PASS1 4, 10 ; IDCT16_PASS1 6, 8 ; ; add r0, 16 ; add r3, 16 ; dec r4d ; jnz .pass1 ; ; mov r3, rsp ; mov r4d, 8 ; lea r5, [tab_idct16_2] ; lea r6, [tab_idct16_1] ; ; vbroadcasti128 m7, [r5 ] ; vbroadcasti128 m8, [r5 + 16] ; vbroadcasti128 m9, [r5 + 32] ; vbroadcasti128 m10, [r5 + 48] ; vbroadcasti128 m11, [r5 + 64] ; vbroadcasti128 m12, [r5 + 80] ; vbroadcasti128 m13, [r5 + 96] ; ; .pass2: ; movu m1, [r3] ; vpermq m0, m1, 0xD8 ; ; pmaddwd m1, m0, m7 ; pmaddwd m2, m0, m8 ; phaddd m1, m2 ; ; pmaddwd m2, m0, m9 ; pmaddwd m3, m0, m10 ; phaddd m2, m3 ; ; phaddd m1, m2 ; ; pmaddwd m2, m0, m11 ; pmaddwd m3, m0, 
m12 ; phaddd m2, m3 ; ; vbroadcasti128 m14, [r5 + 112] ; pmaddwd m3, m0, m13 ; pmaddwd m4, m0, m14 ; phaddd m3, m4 ; ; phaddd m2, m3 ; ; movu m3, [r3 + 32] ; vpermq m0, m3, 0xD8 ; ; vbroadcasti128 m14, [r6] ; pmaddwd m3, m0, m14 ; vbroadcasti128 m14, [r6 + 16] ; pmaddwd m4, m0, m14 ; phaddd m3, m4 ; ; vbroadcasti128 m14, [r6 + 32] ; pmaddwd m4, m0, m14 ; vbroadcasti128 m14, [r6 + 48] ; pmaddwd m5, m0, m14 ; phaddd m4, m5 ; ; phaddd m3, m4 ; ; vbroadcasti128 m14, [r6 + 64] ; pmaddwd m4, m0, m14 ; vbroadcasti128 m14, [r6 + 80] ; pmaddwd m5, m0, m14 ; phaddd m4, m5 ; ; vbroadcasti128 m14, [r6 + 96] ; pmaddwd m6, m0, m14 ; vbroadcasti128 m14, [r6 + 112] ; pmaddwd m0, m14 ; phaddd m6, m0 ; ; phaddd m4, m6 ; ; paddd m5, m1, m3 ; paddd m5, m15 ; psrad m5, IDCT16_SHIFT2 ; ; psubd m1, m3 ; paddd m1, m15 ; psrad m1, IDCT16_SHIFT2 ; ; paddd m6, m2, m4 ; paddd m6, m15 ; psrad m6, IDCT16_SHIFT2 ; ; psubd m2, m4 ; paddd m2, m15 ; psrad m2, IDCT16_SHIFT2 ; ; packssdw m5, m6 ; packssdw m1, m2 ; pshufb m2, m1, [dct16_shuf1] ; ; mova [r1 ], xm5 ; mova [r1 + 16], xm2 ; vextracti128 [r1 + r2 ], m5, 1 ; vextracti128 [r1 + r2 + 16], m2, 1 ; ; lea r1, [r1 + 2 * r2] ; add r3, 64 ; dec r4d ; jnz .pass2 ; RET ; %undef IDCT16_SHIFT1 %undef IDCT16_SHIFT2 %undef IDCT16_ADD1 %undef IDCT16_ADD2 %macro IDCT32_PASS1 1 vbroadcasti128 m3, [tab_idct32_1+%1*32 ] ; vbroadcasti128 m13, [tab_idct32_1+%1*32+16] ; pmaddwd m9, m4, m3 ; pmaddwd m10, m8, m13 ; phaddd m9, m10 ; ; pmaddwd m10, m2, m3 ; pmaddwd m11, m1, m13 ; phaddd m10, m11 ; ; phaddd m9, m10 ; ; vbroadcasti128 m3, [tab_idct32_1+(15 - %1)*32 ] vbroadcasti128 m13, [tab_idct32_1+(15 - %1)*32+16] pmaddwd m10, m4, m3 ; pmaddwd m11, m8, m13 ; phaddd m10, m11 ; ; pmaddwd m11, m2, m3 ; pmaddwd m12, m1, m13 ; phaddd m11, m12 ; ; phaddd m10, m11 ; phaddd m9, m10 ; [row0s0 row2s0 row0s15 row2s15 row1s0 row3s0 row1s15 row3s15] ; vbroadcasti128 m3, [tab_idct32_2 + %1*16] ; pmaddwd m10, m0, m3 ; pmaddwd m11, m7, m3 ; phaddd m10, m11 ; phaddd m10, m10 ; ; vbroadcasti128 m3, [tab_idct32_3 + %1*16] ; pmaddwd m11, m5, m3 ; pmaddwd m12, m6, m3 ; phaddd m11, m12 ; phaddd m11, m11 ; ; paddd m12, m10, m11 ; [row0a0 row2a0 NIL NIL row1sa0 row3a0 NIL NIL] psubd m10, m11 ; [row0a15 row2a15 NIL NIL row1a15 row3a15 NIL NIL] ; punpcklqdq m12, m10 ; [row0a0 row2a0 row0a15 row2a15 row1a0 row3a0 row1a15 row3a15] paddd m10, m9, m12 ; paddd m10, m15 ; psrad m10, IDCT32_SHIFT1 ; ; psubd m12, m9 ; paddd m12, m15 ; psrad m12, IDCT32_SHIFT1 ; ; packssdw m10, m12 ; vextracti128 xm12, m10, 1 ; movd [r3 + %1*64], xm10 ; movd [r3 + 32 + %1*64], xm12 ; pextrd [r4 - %1*64], xm10, 1 ; pextrd [r4 + 32 - %1*64], xm12, 1 ; pextrd [r3 + 16*64 + %1*64], xm10, 3 ; pextrd [r3 + 16*64 + 32 + %1*64], xm12, 3 ; pextrd [r4 + 16*64 - %1*64], xm10, 2 ; pextrd [r4 + 16*64 + 32 - %1*64], xm12, 2 ; %endmacro ; ---------------------------------------------------------------------------- ; void idct_32x32(const coeff_t *src, coeff_t *dst, int i_dst) ; ---------------------------------------------------------------------------- ; TODO: Reduce PHADDD instruction by PADDD ; ------------------------------------------------------------------ ; idct_32x32_avx2 INIT_YMM avx2 cglobal idct_32x32, 3, 6, 16, 0-32*64 %define IDCT32_SHIFT1 5 ; shift1 = 5 %define IDCT32_ADD1 [pd_16] ; add1 = 16 ; vbroadcasti128 m15, IDCT32_ADD1 ; add1 = 16 ; mov r3, rsp ; lea r4, [r3 + 15 * 64] ; mov r5d, 8 ; ; .pass1: ; movq xm0, [r0 + 2 * 64] ; movq xm1, [r0 + 18 * 64] ; punpcklqdq xm0, xm0, xm1 ; movq xm1, [r0 + 0 * 64] ; movq xm2, [r0 + 16 * 64] ; 
punpcklqdq xm1, xm1, xm2 ; vinserti128 m0, m0, xm1, 1 ; [2 18 0 16] ; movq xm1, [r0 + 1 * 64] ; movq xm2, [r0 + 9 * 64] ; punpcklqdq xm1, xm1, xm2 ; movq xm2, [r0 + 17 * 64] ; movq xm3, [r0 + 25 * 64] ; punpcklqdq xm2, xm2, xm3 ; vinserti128 m1, m1, xm2, 1 ; [1 9 17 25] ; movq xm2, [r0 + 6 * 64] ; movq xm3, [r0 + 22 * 64] ; punpcklqdq xm2, xm2, xm3 ; movq xm3, [r0 + 4 * 64] ; movq xm4, [r0 + 20 * 64] ; punpcklqdq xm3, xm3, xm4 ; vinserti128 m2, m2, xm3, 1 ; [6 22 4 20] ; movq xm3, [r0 + 3 * 64] ; movq xm4, [r0 + 11 * 64] ; punpcklqdq xm3, xm3, xm4 ; movq xm4, [r0 + 19 * 64] ; movq xm5, [r0 + 27 * 64] ; punpcklqdq xm4, xm4, xm5 ; vinserti128 m3, m3, xm4, 1 ; [3 11 17 25] ; movq xm4, [r0 + 10 * 64] ; movq xm5, [r0 + 26 * 64] ; punpcklqdq xm4, xm4, xm5 ; movq xm5, [r0 + 8 * 64] ; movq xm6, [r0 + 24 * 64] ; punpcklqdq xm5, xm5, xm6 ; vinserti128 m4, m4, xm5, 1 ; [10 26 8 24] ; movq xm5, [r0 + 5 * 64] ; movq xm6, [r0 + 13 * 64] ; punpcklqdq xm5, xm5, xm6 ; movq xm6, [r0 + 21 * 64] ; movq xm7, [r0 + 29 * 64] ; punpcklqdq xm6, xm6, xm7 ; vinserti128 m5, m5, xm6, 1 ; [5 13 21 9] ; movq xm6, [r0 + 14 * 64] ; movq xm7, [r0 + 30 * 64] ; punpcklqdq xm6, xm6, xm7 ; movq xm7, [r0 + 12 * 64] ; movq xm8, [r0 + 28 * 64] ; punpcklqdq xm7, xm7, xm8 ; vinserti128 m6, m6, xm7, 1 ; [14 30 12 28] ; movq xm7, [r0 + 7 * 64] ; movq xm8, [r0 + 15 * 64] ; punpcklqdq xm7, xm7, xm8 ; movq xm8, [r0 + 23 * 64] ; movq xm9, [r0 + 31 * 64] ; punpcklqdq xm8, xm8, xm9 ; vinserti128 m7, m7, xm8, 1 ; [7 15 23 31] ; punpckhwd m8, m0, m2 ; [18 22 16 20] punpcklwd m0, m2 ; [2 6 0 4] ; punpckhwd m2, m1, m3 ; [9 11 25 27] punpcklwd m1, m3 ; [1 3 17 19] ; punpckhwd m3, m4, m6 ; [26 30 24 28] punpcklwd m4, m6 ; [10 14 8 12] ; punpckhwd m6, m5, m7 ; [13 15 29 31] punpcklwd m5, m7 ; [5 7 21 23] ; punpckhdq m7, m0, m4 ; [22 62 102 142 23 63 103 143 02 42 82 122 03 43 83 123] punpckldq m0, m4 ; [20 60 100 140 21 61 101 141 00 40 80 120 01 41 81 121] ; punpckhdq m4, m8, m3 ; [182 222 262 302 183 223 263 303 162 202 242 282 163 203 243 283] punpckldq m8, m3 ; [180 220 260 300 181 221 261 301 160 200 240 280 161 201 241 281] ; punpckhdq m3, m1, m5 ; [12 32 52 72 13 33 53 73 172 192 212 232 173 193 213 233] punpckldq m1, m5 ; [10 30 50 70 11 31 51 71 170 190 210 230 171 191 211 231] ; punpckhdq m5, m2, m6 ; [92 112 132 152 93 113 133 153 252 272 292 312 253 273 293 313] punpckldq m2, m6 ; [90 110 130 150 91 111 131 151 250 270 290 310 251 271 291 311] ; punpckhqdq m6, m0, m8 ; [21 61 101 141 181 221 261 301 01 41 81 121 161 201 241 281] punpcklqdq m0, m8 ; [20 60 100 140 180 220 260 300 00 40 80 120 160 200 240 280] ; punpckhqdq m8, m7, m4 ; [23 63 103 143 183 223 263 303 03 43 83 123 163 203 243 283] punpcklqdq m7, m4 ; [22 62 102 142 182 222 262 302 02 42 82 122 162 202 242 282] ; punpckhqdq m4, m1, m2 ; [11 31 51 71 91 111 131 151 171 191 211 231 251 271 291 311] punpcklqdq m1, m2 ; [10 30 50 70 90 110 130 150 170 190 210 230 250 270 290 310] ; punpckhqdq m2, m3, m5 ; [13 33 53 73 93 113 133 153 173 193 213 233 253 273 293 313] punpcklqdq m3, m5 ; [12 32 52 72 92 112 132 152 172 192 212 232 252 272 292 312] ; vperm2i128 m5, m0, m6, 0x20 ; [20 60 100 140 180 220 260 300 21 61 101 141 181 221 261 301] vperm2i128 m0, m0, m6, 0x31 ; [00 40 80 120 160 200 240 280 01 41 81 121 161 201 241 281] ; vperm2i128 m6, m7, m8, 0x20 ; [22 62 102 142 182 222 262 302 23 63 103 143 183 223 263 303] vperm2i128 m7, m7, m8, 0x31 ; [02 42 82 122 162 202 242 282 03 43 83 123 163 203 243 283] ; vperm2i128 m8, m1, m4, 0x31 ; [170 190 210 230 250 270 290 310 171 191 
211 231 251 271 291 311] vperm2i128 m4, m1, m4, 0x20 ; [10 30 50 70 90 110 130 150 11 31 51 71 91 111 131 151] ; vperm2i128 m1, m3, m2, 0x31 ; [172 192 212 232 252 272 292 312 173 193 213 233 253 273 293 313] vperm2i128 m2, m3, m2, 0x20 ; [12 32 52 72 92 112 132 152 13 33 53 73 93 113 133 153] ; IDCT32_PASS1 0 ; IDCT32_PASS1 1 ; IDCT32_PASS1 2 ; IDCT32_PASS1 3 ; IDCT32_PASS1 4 ; IDCT32_PASS1 5 ; IDCT32_PASS1 6 ; IDCT32_PASS1 7 ; ; add r0, 8 ; add r3, 4 ; add r4, 4 ; dec r5d ; jnz .pass1 ; ; %if BIT_DEPTH == 10 ; %define IDCT_SHIFT2 10 ; vpbroadcastd m15, [pd_512 ] ; %elif BIT_DEPTH == 8 ; for BIT_DEPTH: 8 test r2, 0x01 ; test flag? jz .b32x32 ; lea r5, [pd_11 ] ; shift2 = 11 vpbroadcastq m15, [pd_2048] ; add2 = 1024 and r2, 0xFE ; clear the flag jmp .normal_start ; .b32x32: ; lea r5, [pd_12 ] ; shift2 = 12 vpbroadcastq m15, [pd_2048] ; add2 = 2048 .normal_start: ; %else ; %error Unsupported BIT_DEPTH! ; %endif ; ; mov r3, rsp ; add r2d, r2d ; mov r4d, 32 ; ; mova m7, [tab_idct32_4 ] ; mova m8, [tab_idct32_4+ 32] ; mova m9, [tab_idct32_4+ 64] ; mova m10, [tab_idct32_4+ 96] ; mova m11, [tab_idct32_4+128] ; mova m12, [tab_idct32_4+160] ; mova m13, [tab_idct32_4+192] ; mova m14, [tab_idct32_4+224] ; .pass2: ; movu m0, [r3] ; movu m1, [r3 + 32] ; ; pmaddwd m2, m0, m7 ; pmaddwd m3, m0, m8 ; phaddd m2, m3 ; ; pmaddwd m3, m0, m9 ; pmaddwd m4, m0, m10 ; phaddd m3, m4 ; ; phaddd m2, m3 ; ; pmaddwd m3, m0, m11 ; pmaddwd m4, m0, m12 ; phaddd m3, m4 ; ; pmaddwd m4, m0, m13 ; pmaddwd m5, m0, m14 ; phaddd m4, m5 ; ; phaddd m3, m4 ; ; vperm2i128 m4, m2, m3, 0x31 ; vperm2i128 m2, m2, m3, 0x20 ; paddd m2, m4 ; ; pmaddwd m3, m0, [tab_idct32_4+256] ; pmaddwd m4, m0, [tab_idct32_4+288] ; phaddd m3, m4 ; ; pmaddwd m4, m0, [tab_idct32_4+320] ; pmaddwd m5, m0, [tab_idct32_4+352] ; phaddd m4, m5 ; ; phaddd m3, m4 ; ; pmaddwd m4, m0, [tab_idct32_4+384] ; pmaddwd m5, m0, [tab_idct32_4+416] ; phaddd m4, m5 ; ; pmaddwd m5, m0, [tab_idct32_4+448] ; pmaddwd m0, [tab_idct32_4+480] ; phaddd m5, m0 ; ; phaddd m4, m5 ; ; vperm2i128 m0, m3, m4, 0x31 ; vperm2i128 m3, m3, m4, 0x20 ; paddd m3, m0 ; ; pmaddwd m4, m1, [tab_idct32_1] ; pmaddwd m0, m1, [tab_idct32_1+32] ; phaddd m4, m0 ; ; pmaddwd m5, m1, [tab_idct32_1+ 64] ; pmaddwd m0, m1, [tab_idct32_1+ 96] ; phaddd m5, m0 ; ; phaddd m4, m5 ; ; pmaddwd m5, m1, [tab_idct32_1+128] ; pmaddwd m0, m1, [tab_idct32_1+160] ; phaddd m5, m0 ; ; pmaddwd m6, m1, [tab_idct32_1+192] ; pmaddwd m0, m1, [tab_idct32_1+224] ; phaddd m6, m0 ; ; phaddd m5, m6 ; ; vperm2i128 m0, m4, m5, 0x31 ; vperm2i128 m4, m4, m5, 0x20 ; paddd m4, m0 ; ; pmaddwd m5, m1, [tab_idct32_1+256] ; pmaddwd m0, m1, [tab_idct32_1+288] ; phaddd m5, m0 ; ; pmaddwd m6, m1, [tab_idct32_1+320] ; pmaddwd m0, m1, [tab_idct32_1+352] ; phaddd m6, m0 ; ; phaddd m5, m6 ; ; pmaddwd m6, m1, [tab_idct32_1+384] ; pmaddwd m0, m1, [tab_idct32_1+416] ; phaddd m6, m0 ; ; pmaddwd m0, m1, [tab_idct32_1+448] ; pmaddwd m1, [tab_idct32_1+480] ; phaddd m0, m1 ; ; phaddd m6, m0 ; ; vperm2i128 m0, m5, m6, 0x31 ; vperm2i128 m5, m5, m6, 0x20 ; paddd m5, m0 ; ; paddd m6, m2, m4 ; paddd m6, m15 ; psrad m6, [r5] ; shift2 ; psubd m2, m4 ; paddd m2, m15 ; psrad m2, [r5] ; shift2 ; paddd m4, m3, m5 ; paddd m4, m15 ; psrad m4, [r5] ; shift2 ; psubd m3, m5 ; paddd m3, m15 ; psrad m3, [r5] ; shift2 ; packssdw m6, m4 ; packssdw m2, m3 ; ; vpermq m6, m6, 0xD8 ; vpermq m2, m2, 0x8D ; pshufb m2, [dct16_shuf1] ; ; movu [r1 ], m6 ; movu [r1 + 32], m2 ; ; add r1, r2 ; add r3, 64 ; dec r4d ; jnz .pass2 ; RET ; %undef IDCT32_SHIFT1 %undef IDCT32_SHIFT2 %undef IDCT32_ADD1 
%undef IDCT32_ADD2 %endif ; if ARCH_X86_64 == 1 davs2-1.6/source/common/x86/dct8.h000066400000000000000000000036271337322544400166460ustar00rootroot00000000000000/***************************************************************************** * Copyright (C) 2013-2017 MulticoreWare, Inc * * Authors: Nabajit Deka ;* Min Chen * Jiaqi Zhang * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at license @ x265.com. *****************************************************************************/ #ifndef DAVS2_I386_DCT8_H #define DAVS2_I386_DCT8_H #ifdef __cplusplus extern "C" { #endif void FPFX(idct_4x4_sse2 )(const coeff_t *src, coeff_t *dst, int i_dst); void FPFX(idct_8x8_ssse3)(const coeff_t *src, coeff_t *dst, int i_dst); #if ARCH_X86_64 void FPFX(idct_4x4_avx2 )(const coeff_t *src, coeff_t *dst, int i_dst); void FPFX(idct_8x8_sse2 )(const coeff_t *src, coeff_t *dst, int i_dst); void FPFX(idct_8x8_avx2 )(const coeff_t *src, coeff_t *dst, int i_dst); void FPFX(idct_16x16_avx2)(const coeff_t *src, coeff_t *dst, int i_dst); void FPFX(idct_32x32_avx2)(const coeff_t *src, coeff_t *dst, int i_dst); #endif #ifdef __cplusplus } #endif #endif // ifndef DAVS2_I386_DCT8_H davs2-1.6/source/common/x86/ipfilter8.asm000066400000000000000000033763601337322544400202550ustar00rootroot00000000000000;***************************************************************************** ;* Copyright (C) 2013-2017 MulticoreWare, Inc ;* ;* Authors: Min Chen ;* Nabajit Deka ;* Praveen Kumar Tiwari ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by ;* the Free Software Foundation; either version 2 of the License, or ;* (at your option) any later version. ;* ;* This program is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;* GNU General Public License for more details. ;* ;* You should have received a copy of the GNU General Public License ;* along with this program; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. ;* ;* This program is also available under a commercial proprietary license. ;* For more information, contact us at license @ x265.com. 
;*****************************************************************************/ %include "x86inc.asm" %include "x86util.asm" SECTION_RODATA 32 const tab_Tm, db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 db 8, 9,10,11, 9,10,11,12,10,11,12,13,11,12,13, 14 const interp4_vpp_shuf, times 2 db 0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15 const interp_vert_shuf, times 2 db 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9 times 2 db 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11, 13 const interp4_vpp_shuf1, dd 0, 1, 1, 2, 2, 3, 3, 4 dd 2, 3, 3, 4, 4, 5, 5, 6 const pb_8tap_hps_0, times 2 db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 times 2 db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9,10 times 2 db 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9,10,10,11,11,12 times 2 db 6, 7, 7, 8, 8, 9, 9,10,10,11,11,12,12,13,13,14 const tab_Lm, db 0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8 db 2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9, 10 db 4, 5, 6, 7, 8, 9, 10, 11, 5, 6, 7, 8, 9, 10, 11, 12 db 6, 7, 8, 9, 10, 11, 12, 13, 7, 8, 9, 10, 11, 12, 13, 14 const tab_Vm, db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 db 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3 const tab_Cm, db 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3 const pd_526336, times 8 dd 8192*64+2048 const tab_ChromaCoeff, db 0, 64, 0, 0 db -2, 58, 10, -2 db -4, 54, 16, -2 db -6, 46, 28, -4 db -4, 36, 36, -4 db -4, 28, 46, -6 db -2, 16, 54, -4 db -2, 10, 58, -2 const tabw_ChromaCoeff, dw 0, 64, 0, 0 dw -2, 58, 10, -2 dw -4, 54, 16, -2 dw -6, 46, 28, -4 dw -4, 36, 36, -4 dw -4, 28, 46, -6 dw -2, 16, 54, -4 dw -2, 10, 58, -2 const tab_ChromaCoeff_V, times 8 db 0, 64 times 8 db 0, 0 times 8 db -2, 58 times 8 db 10, -2 times 8 db -4, 54 times 8 db 16, -2 times 8 db -6, 46 times 8 db 28, -4 times 8 db -4, 36 times 8 db 36, -4 times 8 db -4, 28 times 8 db 46, -6 times 8 db -2, 16 times 8 db 54, -4 times 8 db -2, 10 times 8 db 58, -2 const tab_ChromaCoeffV, times 4 dw 0, 64 times 4 dw 0, 0 times 4 dw -2, 58 times 4 dw 10, -2 times 4 dw -4, 54 times 4 dw 16, -2 times 4 dw -6, 46 times 4 dw 28, -4 times 4 dw -4, 36 times 4 dw 36, -4 times 4 dw -4, 28 times 4 dw 46, -6 times 4 dw -2, 16 times 4 dw 54, -4 times 4 dw -2, 10 times 4 dw 58, -2 const pw_ChromaCoeffV, times 8 dw 0, 64 times 8 dw 0, 0 times 8 dw -2, 58 times 8 dw 10, -2 times 8 dw -4, 54 times 8 dw 16, -2 times 8 dw -6, 46 times 8 dw 28, -4 times 8 dw -4, 36 times 8 dw 36, -4 times 8 dw -4, 28 times 8 dw 46, -6 times 8 dw -2, 16 times 8 dw 54, -4 times 8 dw -2, 10 times 8 dw 58, -2 const tab_LumaCoeff, db 0, 0, 0, 64, 0, 0, 0, 0 db -1, 4, -10, 58, 17, -5, 1, 0 db -1, 4, -11, 40, 40, -11, 4, -1 db 0, 1, -5, 17, 58, -10, 4, -1 const tabw_LumaCoeff, dw 0, 0, 0, 64, 0, 0, 0, 0 dw -1, 4, -10, 58, 17, -5, 1, 0 dw -1, 4, -11, 40, 40, -11, 4, -1 dw 0, 1, -5, 17, 58, -10, 4, -1 const tab_LumaCoeffV, times 4 dw 0, 0 times 4 dw 0, 64 times 4 dw 0, 0 times 4 dw 0, 0 times 4 dw -1, 4 times 4 dw -10, 58 times 4 dw 17, -5 times 4 dw 1, 0 times 4 dw -1, 4 times 4 dw -11, 40 times 4 dw 40, -11 times 4 dw 4, -1 times 4 dw 0, 1 times 4 dw -5, 17 times 4 dw 58, -10 times 4 dw 4, -1 const pw_LumaCoeffVer, times 8 dw 0, 0 times 8 dw 0, 64 times 8 dw 0, 0 times 8 dw 0, 0 times 8 dw -1, 4 times 8 dw -10, 58 times 8 dw 17, -5 times 8 dw 1, 0 times 8 dw -1, 4 times 8 dw -11, 40 times 8 dw 40, -11 times 8 dw 4, -1 times 8 dw 0, 1 times 8 dw -5, 17 times 8 dw 58, -10 times 8 dw 4, -1 const pb_LumaCoeffVer, times 16 db 0, 0 times 16 db 0, 64 times 16 db 0, 0 times 16 
db 0, 0 times 16 db -1, 4 times 16 db -10, 58 times 16 db 17, -5 times 16 db 1, 0 times 16 db -1, 4 times 16 db -11, 40 times 16 db 40, -11 times 16 db 4, -1 times 16 db 0, 1 times 16 db -5, 17 times 16 db 58, -10 times 16 db 4, -1 const tab_LumaCoeffVer, times 8 db 0, 0 times 8 db 0, 64 times 8 db 0, 0 times 8 db 0, 0 times 8 db -1, 4 times 8 db -10, 58 times 8 db 17, -5 times 8 db 1, 0 times 8 db -1, 4 times 8 db -11, 40 times 8 db 40, -11 times 8 db 4, -1 times 8 db 0, 1 times 8 db -5, 17 times 8 db 58, -10 times 8 db 4, -1 const tab_LumaCoeffVer_32, times 16 db 0, 0 times 16 db 0, 64 times 16 db 0, 0 times 16 db 0, 0 times 16 db -1, 4 times 16 db -10, 58 times 16 db 17, -5 times 16 db 1, 0 times 16 db -1, 4 times 16 db -11, 40 times 16 db 40, -11 times 16 db 4, -1 times 16 db 0, 1 times 16 db -5, 17 times 16 db 58, -10 times 16 db 4, -1 const tab_ChromaCoeffVer_32, times 16 db 0, 64 times 16 db 0, 0 times 16 db -2, 58 times 16 db 10, -2 times 16 db -4, 54 times 16 db 16, -2 times 16 db -6, 46 times 16 db 28, -4 times 16 db -4, 36 times 16 db 36, -4 times 16 db -4, 28 times 16 db 46, -6 times 16 db -2, 16 times 16 db 54, -4 times 16 db -2, 10 times 16 db 58, -2 const tab_c_64_n64, times 8 db 64, -64 const interp4_shuf, times 2 db 0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15 const interp4_horiz_shuf1, db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 const interp4_hpp_shuf, times 2 db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12 const interp8_hps_shuf, dd 0, 4, 1, 5, 2, 6, 3, 7 ALIGN 32 interp4_hps_shuf: times 2 db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12 SECTION .text cextern pb_128 cextern pw_1 cextern pw_32 cextern pw_512 cextern pw_2000 cextern pw_8192 %macro FILTER_H4_w2_2_sse2 0 pxor m3, m3 movd m0, [srcq - 1] movd m2, [srcq] punpckldq m0, m2 punpcklbw m0, m3 movd m1, [srcq + srcstrideq - 1] movd m2, [srcq + srcstrideq] punpckldq m1, m2 punpcklbw m1, m3 pmaddwd m0, m4 pmaddwd m1, m4 packssdw m0, m1 pshuflw m1, m0, q2301 pshufhw m1, m1, q2301 paddw m0, m1 psrld m0, 16 packssdw m0, m0 paddw m0, m5 psraw m0, 6 packuswb m0, m0 movd r4, m0 mov [dstq], r4w shr r4, 16 mov [dstq + dststrideq], r4w %endmacro ;----------------------------------------------------------------------------- ; void interp_4tap_horiz_pp_2xN(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;----------------------------------------------------------------------------- %macro FILTER_H4_W2xN_sse3 1 INIT_XMM sse3 cglobal interp_4tap_horiz_pp_2x%1, 4, 6, 6, src, srcstride, dst, dststride mov r4d, r4m mova m5, [pw_32] %ifdef PIC lea r5, [tabw_ChromaCoeff] movddup m4, [r5 + r4 * 8] %else movddup m4, [tabw_ChromaCoeff + r4 * 8] %endif %assign x 1 %rep %1/2 FILTER_H4_w2_2_sse2 %if x < %1/2 lea srcq, [srcq + srcstrideq * 2] lea dstq, [dstq + dststrideq * 2] %endif %assign x x+1 %endrep RET %endmacro FILTER_H4_W2xN_sse3 4 FILTER_H4_W2xN_sse3 8 FILTER_H4_W2xN_sse3 16 %macro FILTER_H4_w4_2_sse2 0 pxor m5, m5 movd m0, [srcq - 1] movd m6, [srcq] punpckldq m0, m6 punpcklbw m0, m5 movd m1, [srcq + 1] movd m6, [srcq + 2] punpckldq m1, m6 punpcklbw m1, m5 movd m2, [srcq + srcstrideq - 1] movd m6, [srcq + srcstrideq] punpckldq m2, m6 punpcklbw m2, m5 movd m3, [srcq + srcstrideq + 1] movd m6, [srcq + srcstrideq + 2] punpckldq m3, m6 punpcklbw m3, m5 pmaddwd m0, m4 pmaddwd m1, m4 pmaddwd m2, m4 pmaddwd m3, m4 packssdw m0, m1 packssdw m2, m3 pshuflw m1, m0, q2301 pshufhw m1, m1, q2301 pshuflw m3, m2, q2301 pshufhw m3, m3, 
q2301 paddw m0, m1 paddw m2, m3 psrld m0, 16 psrld m2, 16 packssdw m0, m2 paddw m0, m7 psraw m0, 6 packuswb m0, m2 movd [dstq], m0 psrldq m0, 4 movd [dstq + dststrideq], m0 %endmacro ;----------------------------------------------------------------------------- ; void interp_4tap_horiz_pp_4x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;----------------------------------------------------------------------------- %macro FILTER_H4_W4xN_sse3 1 INIT_XMM sse3 cglobal interp_4tap_horiz_pp_4x%1, 4, 6, 8, src, srcstride, dst, dststride mov r4d, r4m mova m7, [pw_32] %ifdef PIC lea r5, [tabw_ChromaCoeff] movddup m4, [r5 + r4 * 8] %else movddup m4, [tabw_ChromaCoeff + r4 * 8] %endif %assign x 1 %rep %1/2 FILTER_H4_w4_2_sse2 %if x < %1/2 lea srcq, [srcq + srcstrideq * 2] lea dstq, [dstq + dststrideq * 2] %endif %assign x x+1 %endrep RET %endmacro FILTER_H4_W4xN_sse3 2 FILTER_H4_W4xN_sse3 4 FILTER_H4_W4xN_sse3 8 FILTER_H4_W4xN_sse3 16 FILTER_H4_W4xN_sse3 32 %macro FILTER_H4_w6_sse2 0 pxor m4, m4 movh m0, [srcq - 1] movh m5, [srcq] punpckldq m0, m5 movhlps m2, m0 punpcklbw m0, m4 punpcklbw m2, m4 movd m1, [srcq + 1] movd m5, [srcq + 2] punpckldq m1, m5 punpcklbw m1, m4 pmaddwd m0, m6 pmaddwd m1, m6 pmaddwd m2, m6 packssdw m0, m1 packssdw m2, m2 pshuflw m1, m0, q2301 pshufhw m1, m1, q2301 pshuflw m3, m2, q2301 paddw m0, m1 paddw m2, m3 psrld m0, 16 psrld m2, 16 packssdw m0, m2 paddw m0, m7 psraw m0, 6 packuswb m0, m0 movd [dstq], m0 pextrw r4d, m0, 2 mov [dstq + 4], r4w %endmacro %macro FILH4W8_sse2 1 movh m0, [srcq - 1 + %1] movh m5, [srcq + %1] punpckldq m0, m5 movhlps m2, m0 punpcklbw m0, m4 punpcklbw m2, m4 movh m1, [srcq + 1 + %1] movh m5, [srcq + 2 + %1] punpckldq m1, m5 movhlps m3, m1 punpcklbw m1, m4 punpcklbw m3, m4 pmaddwd m0, m6 pmaddwd m1, m6 pmaddwd m2, m6 pmaddwd m3, m6 packssdw m0, m1 packssdw m2, m3 pshuflw m1, m0, q2301 pshufhw m1, m1, q2301 pshuflw m3, m2, q2301 pshufhw m3, m3, q2301 paddw m0, m1 paddw m2, m3 psrld m0, 16 psrld m2, 16 packssdw m0, m2 paddw m0, m7 psraw m0, 6 packuswb m0, m0 movh [dstq + %1], m0 %endmacro %macro FILTER_H4_w8_sse2 0 FILH4W8_sse2 0 %endmacro %macro FILTER_H4_w12_sse2 0 FILH4W8_sse2 0 movd m1, [srcq - 1 + 8] movd m3, [srcq + 8] punpckldq m1, m3 punpcklbw m1, m4 movd m2, [srcq + 1 + 8] movd m3, [srcq + 2 + 8] punpckldq m2, m3 punpcklbw m2, m4 pmaddwd m1, m6 pmaddwd m2, m6 packssdw m1, m2 pshuflw m2, m1, q2301 pshufhw m2, m2, q2301 paddw m1, m2 psrld m1, 16 packssdw m1, m1 paddw m1, m7 psraw m1, 6 packuswb m1, m1 movd [dstq + 8], m1 %endmacro %macro FILTER_H4_w16_sse2 0 FILH4W8_sse2 0 FILH4W8_sse2 8 %endmacro %macro FILTER_H4_w24_sse2 0 FILH4W8_sse2 0 FILH4W8_sse2 8 FILH4W8_sse2 16 %endmacro %macro FILTER_H4_w32_sse2 0 FILH4W8_sse2 0 FILH4W8_sse2 8 FILH4W8_sse2 16 FILH4W8_sse2 24 %endmacro %macro FILTER_H4_w48_sse2 0 FILH4W8_sse2 0 FILH4W8_sse2 8 FILH4W8_sse2 16 FILH4W8_sse2 24 FILH4W8_sse2 32 FILH4W8_sse2 40 %endmacro %macro FILTER_H4_w64_sse2 0 FILH4W8_sse2 0 FILH4W8_sse2 8 FILH4W8_sse2 16 FILH4W8_sse2 24 FILH4W8_sse2 32 FILH4W8_sse2 40 FILH4W8_sse2 48 FILH4W8_sse2 56 %endmacro ;----------------------------------------------------------------------------- ; void interp_4tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;----------------------------------------------------------------------------- %macro IPFILTER_CHROMA_sse3 2 INIT_XMM sse3 cglobal interp_4tap_horiz_pp_%1x%2, 4, 6, 8, src, srcstride, dst, dststride mov r4d, r4m mova m7, [pw_32] pxor m4, m4 %ifdef PIC lea r5, 
[tabw_ChromaCoeff] movddup m6, [r5 + r4 * 8] %else movddup m6, [tabw_ChromaCoeff + r4 * 8] %endif %assign x 1 %rep %2 FILTER_H4_w%1_sse2 %if x < %2 add srcq, srcstrideq add dstq, dststrideq %endif %assign x x+1 %endrep RET %endmacro IPFILTER_CHROMA_sse3 6, 8 IPFILTER_CHROMA_sse3 8, 2 IPFILTER_CHROMA_sse3 8, 4 IPFILTER_CHROMA_sse3 8, 6 IPFILTER_CHROMA_sse3 8, 8 IPFILTER_CHROMA_sse3 8, 16 IPFILTER_CHROMA_sse3 8, 32 IPFILTER_CHROMA_sse3 12, 16 IPFILTER_CHROMA_sse3 6, 16 IPFILTER_CHROMA_sse3 8, 12 IPFILTER_CHROMA_sse3 8, 64 IPFILTER_CHROMA_sse3 12, 32 IPFILTER_CHROMA_sse3 16, 4 IPFILTER_CHROMA_sse3 16, 8 IPFILTER_CHROMA_sse3 16, 12 IPFILTER_CHROMA_sse3 16, 16 IPFILTER_CHROMA_sse3 16, 32 IPFILTER_CHROMA_sse3 32, 8 IPFILTER_CHROMA_sse3 32, 16 IPFILTER_CHROMA_sse3 32, 24 IPFILTER_CHROMA_sse3 24, 32 IPFILTER_CHROMA_sse3 32, 32 IPFILTER_CHROMA_sse3 16, 24 IPFILTER_CHROMA_sse3 16, 64 IPFILTER_CHROMA_sse3 32, 48 IPFILTER_CHROMA_sse3 24, 64 IPFILTER_CHROMA_sse3 32, 64 IPFILTER_CHROMA_sse3 64, 64 IPFILTER_CHROMA_sse3 64, 32 IPFILTER_CHROMA_sse3 64, 48 IPFILTER_CHROMA_sse3 48, 64 IPFILTER_CHROMA_sse3 64, 16 %macro FILTER_2 2 movd m3, [srcq + %1] movd m4, [srcq + 1 + %1] punpckldq m3, m4 punpcklbw m3, m0 pmaddwd m3, m1 packssdw m3, m3 pshuflw m4, m3, q2301 paddw m3, m4 psrldq m3, 2 psubw m3, m2 movd [dstq + %2], m3 %endmacro %macro FILTER_4 2 movd m3, [srcq + %1] movd m4, [srcq + 1 + %1] punpckldq m3, m4 punpcklbw m3, m0 pmaddwd m3, m1 movd m4, [srcq + 2 + %1] movd m5, [srcq + 3 + %1] punpckldq m4, m5 punpcklbw m4, m0 pmaddwd m4, m1 packssdw m3, m4 pshuflw m4, m3, q2301 pshufhw m4, m4, q2301 paddw m3, m4 psrldq m3, 2 pshufd m3, m3, q3120 psubw m3, m2 movh [dstq + %2], m3 %endmacro %macro FILTER_4TAP_HPS_sse3 2 INIT_XMM sse3 cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 6, src, srcstride, dst, dststride mov r4d, r4m add dststrided, dststrided mova m2, [pw_2000] pxor m0, m0 %ifdef PIC lea r6, [tabw_ChromaCoeff] movddup m1, [r6 + r4 * 8] %else movddup m1, [tabw_ChromaCoeff + r4 * 8] %endif mov r4d, %2 cmp r5m, byte 0 je .loopH sub srcq, srcstrideq add r4d, 3 .loopH: %assign x -1 %assign y 0 %rep %1/4 FILTER_4 x,y %assign x x+4 %assign y y+8 %endrep %rep (%1 % 4)/2 FILTER_2 x,y %endrep add srcq, srcstrideq add dstq, dststrideq dec r4d jnz .loopH RET %endmacro FILTER_4TAP_HPS_sse3 2, 4 FILTER_4TAP_HPS_sse3 2, 8 FILTER_4TAP_HPS_sse3 2, 16 FILTER_4TAP_HPS_sse3 4, 2 FILTER_4TAP_HPS_sse3 4, 4 FILTER_4TAP_HPS_sse3 4, 8 FILTER_4TAP_HPS_sse3 4, 16 FILTER_4TAP_HPS_sse3 4, 32 FILTER_4TAP_HPS_sse3 6, 8 FILTER_4TAP_HPS_sse3 6, 16 FILTER_4TAP_HPS_sse3 8, 2 FILTER_4TAP_HPS_sse3 8, 4 FILTER_4TAP_HPS_sse3 8, 6 FILTER_4TAP_HPS_sse3 8, 8 FILTER_4TAP_HPS_sse3 8, 12 FILTER_4TAP_HPS_sse3 8, 16 FILTER_4TAP_HPS_sse3 8, 32 FILTER_4TAP_HPS_sse3 8, 64 FILTER_4TAP_HPS_sse3 12, 16 FILTER_4TAP_HPS_sse3 12, 32 FILTER_4TAP_HPS_sse3 16, 4 FILTER_4TAP_HPS_sse3 16, 8 FILTER_4TAP_HPS_sse3 16, 12 FILTER_4TAP_HPS_sse3 16, 16 FILTER_4TAP_HPS_sse3 16, 24 FILTER_4TAP_HPS_sse3 16, 32 FILTER_4TAP_HPS_sse3 16, 64 FILTER_4TAP_HPS_sse3 24, 32 FILTER_4TAP_HPS_sse3 24, 64 FILTER_4TAP_HPS_sse3 32, 8 FILTER_4TAP_HPS_sse3 32, 16 FILTER_4TAP_HPS_sse3 32, 24 FILTER_4TAP_HPS_sse3 32, 32 FILTER_4TAP_HPS_sse3 32, 48 FILTER_4TAP_HPS_sse3 32, 64 FILTER_4TAP_HPS_sse3 48, 64 FILTER_4TAP_HPS_sse3 64, 16 FILTER_4TAP_HPS_sse3 64, 32 FILTER_4TAP_HPS_sse3 64, 48 FILTER_4TAP_HPS_sse3 64, 64 %macro FILTER_H8_W8_sse2 0 movh m1, [r0 + x - 3] movh m4, [r0 + x - 2] punpcklbw m1, m6 punpcklbw m4, m6 movh m5, [r0 + x - 1] movh m0, [r0 + x] punpcklbw m5, m6 punpcklbw m0, m6 pmaddwd 
m1, m3 pmaddwd m4, m3 pmaddwd m5, m3 pmaddwd m0, m3 packssdw m1, m4 packssdw m5, m0 pshuflw m4, m1, q2301 pshufhw m4, m4, q2301 pshuflw m0, m5, q2301 pshufhw m0, m0, q2301 paddw m1, m4 paddw m5, m0 psrldq m1, 2 psrldq m5, 2 pshufd m1, m1, q3120 pshufd m5, m5, q3120 punpcklqdq m1, m5 movh m7, [r0 + x + 1] movh m4, [r0 + x + 2] punpcklbw m7, m6 punpcklbw m4, m6 movh m5, [r0 + x + 3] movh m0, [r0 + x + 4] punpcklbw m5, m6 punpcklbw m0, m6 pmaddwd m7, m3 pmaddwd m4, m3 pmaddwd m5, m3 pmaddwd m0, m3 packssdw m7, m4 packssdw m5, m0 pshuflw m4, m7, q2301 pshufhw m4, m4, q2301 pshuflw m0, m5, q2301 pshufhw m0, m0, q2301 paddw m7, m4 paddw m5, m0 psrldq m7, 2 psrldq m5, 2 pshufd m7, m7, q3120 pshufd m5, m5, q3120 punpcklqdq m7, m5 pshuflw m4, m1, q2301 pshufhw m4, m4, q2301 pshuflw m0, m7, q2301 pshufhw m0, m0, q2301 paddw m1, m4 paddw m7, m0 psrldq m1, 2 psrldq m7, 2 pshufd m1, m1, q3120 pshufd m7, m7, q3120 punpcklqdq m1, m7 %endmacro %macro FILTER_H8_W4_sse2 0 movh m1, [r0 + x - 3] movh m0, [r0 + x - 2] punpcklbw m1, m6 punpcklbw m0, m6 movh m4, [r0 + x - 1] movh m5, [r0 + x] punpcklbw m4, m6 punpcklbw m5, m6 pmaddwd m1, m3 pmaddwd m0, m3 pmaddwd m4, m3 pmaddwd m5, m3 packssdw m1, m0 packssdw m4, m5 pshuflw m0, m1, q2301 pshufhw m0, m0, q2301 pshuflw m5, m4, q2301 pshufhw m5, m5, q2301 paddw m1, m0 paddw m4, m5 psrldq m1, 2 psrldq m4, 2 pshufd m1, m1, q3120 pshufd m4, m4, q3120 punpcklqdq m1, m4 pshuflw m0, m1, q2301 pshufhw m0, m0, q2301 paddw m1, m0 psrldq m1, 2 pshufd m1, m1, q3120 %endmacro ;---------------------------------------------------------------------------------------------------------------------------- ; void interp_8tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt) ;---------------------------------------------------------------------------------------------------------------------------- %macro IPFILTER_LUMA_sse2 3 INIT_XMM sse2 cglobal interp_8tap_horiz_%3_%1x%2, 4,6,8 mov r4d, r4m add r4d, r4d pxor m6, m6 %ifidn %3, ps add r3d, r3d cmp r5m, byte 0 %endif %ifdef PIC lea r5, [tabw_LumaCoeff] movu m3, [r5 + r4 * 8] %else movu m3, [tabw_LumaCoeff + r4 * 8] %endif mov r4d, %2 %ifidn %3, pp mova m2, [pw_32] %else mova m2, [pw_2000] je .loopH lea r5, [r1 + 2 * r1] sub r0, r5 add r4d, 7 %endif .loopH: %assign x 0 %rep %1 / 8 FILTER_H8_W8_sse2 %ifidn %3, pp paddw m1, m2 psraw m1, 6 packuswb m1, m1 movh [r2 + x], m1 %else psubw m1, m2 movu [r2 + 2 * x], m1 %endif %assign x x+8 %endrep %rep (%1 % 8) / 4 FILTER_H8_W4_sse2 %ifidn %3, pp paddw m1, m2 psraw m1, 6 packuswb m1, m1 movd [r2 + x], m1 %else psubw m1, m2 movh [r2 + 2 * x], m1 %endif %endrep add r0, r1 add r2, r3 dec r4d jnz .loopH RET %endmacro ;-------------------------------------------------------------------------------------------------------------- ; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;-------------------------------------------------------------------------------------------------------------- IPFILTER_LUMA_sse2 4, 4, pp IPFILTER_LUMA_sse2 4, 8, pp IPFILTER_LUMA_sse2 8, 4, pp IPFILTER_LUMA_sse2 8, 8, pp IPFILTER_LUMA_sse2 16, 16, pp IPFILTER_LUMA_sse2 16, 8, pp IPFILTER_LUMA_sse2 8, 16, pp IPFILTER_LUMA_sse2 16, 12, pp IPFILTER_LUMA_sse2 12, 16, pp IPFILTER_LUMA_sse2 16, 4, pp IPFILTER_LUMA_sse2 4, 16, pp IPFILTER_LUMA_sse2 32, 32, pp IPFILTER_LUMA_sse2 32, 16, pp IPFILTER_LUMA_sse2 16, 32, pp IPFILTER_LUMA_sse2 32, 24, pp IPFILTER_LUMA_sse2 24, 32, pp IPFILTER_LUMA_sse2 32, 8, pp 
IPFILTER_LUMA_sse2 8, 32, pp IPFILTER_LUMA_sse2 64, 64, pp IPFILTER_LUMA_sse2 64, 32, pp IPFILTER_LUMA_sse2 32, 64, pp IPFILTER_LUMA_sse2 64, 48, pp IPFILTER_LUMA_sse2 48, 64, pp IPFILTER_LUMA_sse2 64, 16, pp IPFILTER_LUMA_sse2 16, 64, pp ;---------------------------------------------------------------------------------------------------------------------------- ; void interp_8tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt) ;---------------------------------------------------------------------------------------------------------------------------- IPFILTER_LUMA_sse2 4, 4, ps IPFILTER_LUMA_sse2 8, 8, ps IPFILTER_LUMA_sse2 8, 4, ps IPFILTER_LUMA_sse2 4, 8, ps IPFILTER_LUMA_sse2 16, 16, ps IPFILTER_LUMA_sse2 16, 8, ps IPFILTER_LUMA_sse2 8, 16, ps IPFILTER_LUMA_sse2 16, 12, ps IPFILTER_LUMA_sse2 12, 16, ps IPFILTER_LUMA_sse2 16, 4, ps IPFILTER_LUMA_sse2 4, 16, ps IPFILTER_LUMA_sse2 32, 32, ps IPFILTER_LUMA_sse2 32, 16, ps IPFILTER_LUMA_sse2 16, 32, ps IPFILTER_LUMA_sse2 32, 24, ps IPFILTER_LUMA_sse2 24, 32, ps IPFILTER_LUMA_sse2 32, 8, ps IPFILTER_LUMA_sse2 8, 32, ps IPFILTER_LUMA_sse2 64, 64, ps IPFILTER_LUMA_sse2 64, 32, ps IPFILTER_LUMA_sse2 32, 64, ps IPFILTER_LUMA_sse2 64, 48, ps IPFILTER_LUMA_sse2 48, 64, ps IPFILTER_LUMA_sse2 64, 16, ps IPFILTER_LUMA_sse2 16, 64, ps %macro PROCESS_LUMA_W4_4R_sse2 0 movd m2, [r0] movd m7, [r0 + r1] punpcklbw m2, m7 ; m2=[0 1] lea r0, [r0 + 2 * r1] movd m3, [r0] punpcklbw m7, m3 ; m7=[1 2] punpcklbw m2, m0 punpcklbw m7, m0 pmaddwd m2, [r6 + 0 * 32] pmaddwd m7, [r6 + 0 * 32] packssdw m2, m7 ; m2=[0+1 1+2] movd m7, [r0 + r1] punpcklbw m3, m7 ; m3=[2 3] lea r0, [r0 + 2 * r1] movd m5, [r0] punpcklbw m7, m5 ; m7=[3 4] punpcklbw m3, m0 punpcklbw m7, m0 pmaddwd m4, m3, [r6 + 1 * 32] pmaddwd m6, m7, [r6 + 1 * 32] packssdw m4, m6 ; m4=[2+3 3+4] paddw m2, m4 ; m2=[0+1+2+3 1+2+3+4] Row1-2 pmaddwd m3, [r6 + 0 * 32] pmaddwd m7, [r6 + 0 * 32] packssdw m3, m7 ; m3=[2+3 3+4] Row3-4 movd m7, [r0 + r1] punpcklbw m5, m7 ; m5=[4 5] lea r0, [r0 + 2 * r1] movd m4, [r0] punpcklbw m7, m4 ; m7=[5 6] punpcklbw m5, m0 punpcklbw m7, m0 pmaddwd m6, m5, [r6 + 2 * 32] pmaddwd m8, m7, [r6 + 2 * 32] packssdw m6, m8 ; m6=[4+5 5+6] paddw m2, m6 ; m2=[0+1+2+3+4+5 1+2+3+4+5+6] Row1-2 pmaddwd m5, [r6 + 1 * 32] pmaddwd m7, [r6 + 1 * 32] packssdw m5, m7 ; m5=[4+5 5+6] paddw m3, m5 ; m3=[2+3+4+5 3+4+5+6] Row3-4 movd m7, [r0 + r1] punpcklbw m4, m7 ; m4=[6 7] lea r0, [r0 + 2 * r1] movd m5, [r0] punpcklbw m7, m5 ; m7=[7 8] punpcklbw m4, m0 punpcklbw m7, m0 pmaddwd m6, m4, [r6 + 3 * 32] pmaddwd m8, m7, [r6 + 3 * 32] packssdw m6, m8 ; m7=[6+7 7+8] paddw m2, m6 ; m2=[0+1+2+3+4+5+6+7 1+2+3+4+5+6+7+8] Row1-2 end pmaddwd m4, [r6 + 2 * 32] pmaddwd m7, [r6 + 2 * 32] packssdw m4, m7 ; m4=[6+7 7+8] paddw m3, m4 ; m3=[2+3+4+5+6+7 3+4+5+6+7+8] Row3-4 movd m7, [r0 + r1] punpcklbw m5, m7 ; m5=[8 9] movd m4, [r0 + 2 * r1] punpcklbw m7, m4 ; m7=[9 10] punpcklbw m5, m0 punpcklbw m7, m0 pmaddwd m5, [r6 + 3 * 32] pmaddwd m7, [r6 + 3 * 32] packssdw m5, m7 ; m5=[8+9 9+10] paddw m3, m5 ; m3=[2+3+4+5+6+7+8+9 3+4+5+6+7+8+9+10] Row3-4 end %endmacro %macro PROCESS_LUMA_W8_4R_sse2 0 movq m7, [r0] movq m6, [r0 + r1] punpcklbw m7, m6 punpcklbw m2, m7, m0 punpckhbw m7, m0 pmaddwd m2, [r6 + 0 * 32] pmaddwd m7, [r6 + 0 * 32] packssdw m2, m7 ; m2=[0+1] Row1 lea r0, [r0 + 2 * r1] movq m7, [r0] punpcklbw m6, m7 punpcklbw m3, m6, m0 punpckhbw m6, m0 pmaddwd m3, [r6 + 0 * 32] pmaddwd m6, [r6 + 0 * 32] packssdw m3, m6 ; m3=[1+2] Row2 movq m6, [r0 + r1] punpcklbw m7, m6 
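; the vertical taps are applied on byte-interleaved row pairs: punpcklbw lays
; row N beside row N+1, each pair is widened to words against the zero register
; m0, and a single pmaddwd with the duplicated tap pair at [r6 + k * 32]
; (pw_LumaCoeffVer) applies both taps of filter phase k at once -- e.g. for
; coeffIdx 2 the four pairs are (-1,4), (-11,40), (40,-11) and (4,-1)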
punpckhbw m8, m7, m0 punpcklbw m7, m0 pmaddwd m4, m7, [r6 + 0 * 32] pmaddwd m9, m8, [r6 + 0 * 32] packssdw m4, m9 ; m4=[2+3] Row3 pmaddwd m7, [r6 + 1 * 32] pmaddwd m8, [r6 + 1 * 32] packssdw m7, m8 paddw m2, m7 ; m2=[0+1+2+3] Row1 lea r0, [r0 + 2 * r1] movq m10, [r0] punpcklbw m6, m10 punpckhbw m8, m6, m0 punpcklbw m6, m0 pmaddwd m5, m6, [r6 + 0 * 32] pmaddwd m9, m8, [r6 + 0 * 32] packssdw m5, m9 ; m5=[3+4] Row4 pmaddwd m6, [r6 + 1 * 32] pmaddwd m8, [r6 + 1 * 32] packssdw m6, m8 paddw m3, m6 ; m3 = [1+2+3+4] Row2 movq m6, [r0 + r1] punpcklbw m10, m6 punpckhbw m8, m10, m0 punpcklbw m10, m0 pmaddwd m7, m10, [r6 + 1 * 32] pmaddwd m9, m8, [r6 + 1 * 32] packssdw m7, m9 pmaddwd m10, [r6 + 2 * 32] pmaddwd m8, [r6 + 2 * 32] packssdw m10, m8 paddw m2, m10 ; m2=[0+1+2+3+4+5] Row1 paddw m4, m7 ; m4=[2+3+4+5] Row3 lea r0, [r0 + 2 * r1] movq m10, [r0] punpcklbw m6, m10 punpckhbw m8, m6, m0 punpcklbw m6, m0 pmaddwd m7, m6, [r6 + 1 * 32] pmaddwd m9, m8, [r6 + 1 * 32] packssdw m7, m9 pmaddwd m6, [r6 + 2 * 32] pmaddwd m8, [r6 + 2 * 32] packssdw m6, m8 paddw m3, m6 ; m3=[1+2+3+4+5+6] Row2 paddw m5, m7 ; m5=[3+4+5+6] Row4 movq m6, [r0 + r1] punpcklbw m10, m6 punpckhbw m8, m10, m0 punpcklbw m10, m0 pmaddwd m7, m10, [r6 + 2 * 32] pmaddwd m9, m8, [r6 + 2 * 32] packssdw m7, m9 pmaddwd m10, [r6 + 3 * 32] pmaddwd m8, [r6 + 3 * 32] packssdw m10, m8 paddw m2, m10 ; m2=[0+1+2+3+4+5+6+7] Row1 end paddw m4, m7 ; m4=[2+3+4+5+6+7] Row3 lea r0, [r0 + 2 * r1] movq m10, [r0] punpcklbw m6, m10 punpckhbw m8, m6, m0 punpcklbw m6, m0 pmaddwd m7, m6, [r6 + 2 * 32] pmaddwd m9, m8, [r6 + 2 * 32] packssdw m7, m9 pmaddwd m6, [r6 + 3 * 32] pmaddwd m8, [r6 + 3 * 32] packssdw m6, m8 paddw m3, m6 ; m3=[1+2+3+4+5+6+7+8] Row2 end paddw m5, m7 ; m5=[3+4+5+6+7+8] Row4 movq m6, [r0 + r1] punpcklbw m10, m6 punpckhbw m8, m10, m0 punpcklbw m10, m0 pmaddwd m8, [r6 + 3 * 32] pmaddwd m10, [r6 + 3 * 32] packssdw m10, m8 paddw m4, m10 ; m4=[2+3+4+5+6+7+8+9] Row3 end movq m10, [r0 + 2 * r1] punpcklbw m6, m10 punpckhbw m8, m6, m0 punpcklbw m6, m0 pmaddwd m8, [r6 + 3 * 32] pmaddwd m6, [r6 + 3 * 32] packssdw m6, m8 paddw m5, m6 ; m5=[3+4+5+6+7+8+9+10] Row4 end %endmacro ;------------------------------------------------------------------------------------------------------------- ; void interp_8tap_vert_%3_4x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;------------------------------------------------------------------------------------------------------------- %macro FILTER_VER_LUMA_sse2 3 INIT_XMM sse2 cglobal interp_8tap_vert_%3_%1x%2, 5, 8, 11 lea r5, [3 * r1] sub r0, r5 shl r4d, 7 %ifdef PIC lea r6, [pw_LumaCoeffVer] add r6, r4 %else lea r6, [pw_LumaCoeffVer + r4] %endif %ifidn %3,pp mova m1, [pw_32] %else mova m1, [pw_2000] add r3d, r3d %endif mov r4d, %2/4 lea r5, [3 * r3] pxor m0, m0 .loopH: %assign x 0 %rep (%1 / 8) PROCESS_LUMA_W8_4R_sse2 %ifidn %3,pp paddw m2, m1 paddw m3, m1 paddw m4, m1 paddw m5, m1 psraw m2, 6 psraw m3, 6 psraw m4, 6 psraw m5, 6 packuswb m2, m3 packuswb m4, m5 movh [r2 + x], m2 movhps [r2 + r3 + x], m2 movh [r2 + 2 * r3 + x], m4 movhps [r2 + r5 + x], m4 %else psubw m2, m1 psubw m3, m1 psubw m4, m1 psubw m5, m1 movu [r2 + (2*x)], m2 movu [r2 + r3 + (2*x)], m3 movu [r2 + 2 * r3 + (2*x)], m4 movu [r2 + r5 + (2*x)], m5 %endif %assign x x+8 %if %1 > 8 lea r7, [8 * r1 - 8] sub r0, r7 %endif %endrep %rep (%1 % 8)/4 PROCESS_LUMA_W4_4R_sse2 %ifidn %3,pp paddw m2, m1 psraw m2, 6 paddw m3, m1 psraw m3, 6 packuswb m2, m3 movd [r2 + x], m2 psrldq m2, 4 movd [r2 + r3 + x], m2 psrldq m2, 4 movd [r2 + 2 
* r3 + x], m2 psrldq m2, 4 movd [r2 + r5 + x], m2 %else psubw m2, m1 psubw m3, m1 movh [r2 + (2*x)], m2 movhps [r2 + r3 + (2*x)], m2 movh [r2 + 2 * r3 + (2*x)], m3 movhps [r2 + r5 + (2*x)], m3 %endif %endrep lea r2, [r2 + 4 * r3] %if %1 <= 8 lea r7, [4 * r1] sub r0, r7 %elif %1 == 12 lea r7, [4 * r1 + 8] sub r0, r7 %else lea r0, [r0 + 4 * r1 - %1] %endif dec r4d jnz .loopH RET %endmacro %if ARCH_X86_64 FILTER_VER_LUMA_sse2 4, 4, pp FILTER_VER_LUMA_sse2 4, 8, pp FILTER_VER_LUMA_sse2 4, 16, pp FILTER_VER_LUMA_sse2 8, 4, pp FILTER_VER_LUMA_sse2 8, 8, pp FILTER_VER_LUMA_sse2 8, 16, pp FILTER_VER_LUMA_sse2 8, 32, pp FILTER_VER_LUMA_sse2 12, 16, pp FILTER_VER_LUMA_sse2 16, 4, pp FILTER_VER_LUMA_sse2 16, 8, pp FILTER_VER_LUMA_sse2 16, 12, pp FILTER_VER_LUMA_sse2 16, 16, pp FILTER_VER_LUMA_sse2 16, 32, pp FILTER_VER_LUMA_sse2 16, 64, pp FILTER_VER_LUMA_sse2 24, 32, pp FILTER_VER_LUMA_sse2 32, 8, pp FILTER_VER_LUMA_sse2 32, 16, pp FILTER_VER_LUMA_sse2 32, 24, pp FILTER_VER_LUMA_sse2 32, 32, pp FILTER_VER_LUMA_sse2 32, 64, pp FILTER_VER_LUMA_sse2 48, 64, pp FILTER_VER_LUMA_sse2 64, 16, pp FILTER_VER_LUMA_sse2 64, 32, pp FILTER_VER_LUMA_sse2 64, 48, pp FILTER_VER_LUMA_sse2 64, 64, pp FILTER_VER_LUMA_sse2 4, 4, ps FILTER_VER_LUMA_sse2 4, 8, ps FILTER_VER_LUMA_sse2 4, 16, ps FILTER_VER_LUMA_sse2 8, 4, ps FILTER_VER_LUMA_sse2 8, 8, ps FILTER_VER_LUMA_sse2 8, 16, ps FILTER_VER_LUMA_sse2 8, 32, ps FILTER_VER_LUMA_sse2 12, 16, ps FILTER_VER_LUMA_sse2 16, 4, ps FILTER_VER_LUMA_sse2 16, 8, ps FILTER_VER_LUMA_sse2 16, 12, ps FILTER_VER_LUMA_sse2 16, 16, ps FILTER_VER_LUMA_sse2 16, 32, ps FILTER_VER_LUMA_sse2 16, 64, ps FILTER_VER_LUMA_sse2 24, 32, ps FILTER_VER_LUMA_sse2 32, 8, ps FILTER_VER_LUMA_sse2 32, 16, ps FILTER_VER_LUMA_sse2 32, 24, ps FILTER_VER_LUMA_sse2 32, 32, ps FILTER_VER_LUMA_sse2 32, 64, ps FILTER_VER_LUMA_sse2 48, 64, ps FILTER_VER_LUMA_sse2 64, 16, ps FILTER_VER_LUMA_sse2 64, 32, ps FILTER_VER_LUMA_sse2 64, 48, ps FILTER_VER_LUMA_sse2 64, 64, ps %endif %macro WORD_TO_DOUBLE 1 %if ARCH_X86_64 punpcklbw %1, m8 %else punpcklbw %1, %1 psrlw %1, 8 %endif %endmacro ;----------------------------------------------------------------------------- ; void interp_4tap_vert_%1_2x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;----------------------------------------------------------------------------- %macro FILTER_V4_W2_H4_sse2 2 INIT_XMM sse2 %if ARCH_X86_64 cglobal interp_4tap_vert_%1_2x%2, 4, 6, 9 pxor m8, m8 %else cglobal interp_4tap_vert_%1_2x%2, 4, 6, 8 %endif mov r4d, r4m sub r0, r1 %ifidn %1,pp mova m1, [pw_32] %elifidn %1,ps mova m1, [pw_2000] add r3d, r3d %endif %ifdef PIC lea r5, [tabw_ChromaCoeff] movh m0, [r5 + r4 * 8] %else movh m0, [tabw_ChromaCoeff + r4 * 8] %endif punpcklqdq m0, m0 lea r5, [3 * r1] %assign x 1 %rep %2/4 movd m2, [r0] movd m3, [r0 + r1] movd m4, [r0 + 2 * r1] movd m5, [r0 + r5] punpcklbw m2, m3 punpcklbw m6, m4, m5 punpcklwd m2, m6 WORD_TO_DOUBLE m2 pmaddwd m2, m0 lea r0, [r0 + 4 * r1] movd m6, [r0] punpcklbw m3, m4 punpcklbw m7, m5, m6 punpcklwd m3, m7 WORD_TO_DOUBLE m3 pmaddwd m3, m0 packssdw m2, m3 pshuflw m3, m2, q2301 pshufhw m3, m3, q2301 paddw m2, m3 movd m7, [r0 + r1] punpcklbw m4, m5 punpcklbw m3, m6, m7 punpcklwd m4, m3 WORD_TO_DOUBLE m4 pmaddwd m4, m0 movd m3, [r0 + 2 * r1] punpcklbw m5, m6 punpcklbw m7, m3 punpcklwd m5, m7 WORD_TO_DOUBLE m5 pmaddwd m5, m0 packssdw m4, m5 pshuflw m5, m4, q2301 pshufhw m5, m5, q2301 paddw m4, m5 %ifidn %1,pp psrld m2, 16 psrld m4, 16 packssdw m2, m4 paddw m2, m1 psraw m2, 6 packuswb m2, m2 %if 
ARCH_X86_64 movq r4, m2 mov [r2], r4w shr r4, 16 mov [r2 + r3], r4w lea r2, [r2 + 2 * r3] shr r4, 16 mov [r2], r4w shr r4, 16 mov [r2 + r3], r4w %else movd r4, m2 mov [r2], r4w shr r4, 16 mov [r2 + r3], r4w lea r2, [r2 + 2 * r3] psrldq m2, 4 movd r4, m2 mov [r2], r4w shr r4, 16 mov [r2 + r3], r4w %endif %elifidn %1,ps psrldq m2, 2 psrldq m4, 2 pshufd m2, m2, q3120 pshufd m4, m4, q3120 psubw m4, m1 psubw m2, m1 movd [r2], m2 psrldq m2, 4 movd [r2 + r3], m2 lea r2, [r2 + 2 * r3] movd [r2], m4 psrldq m4, 4 movd [r2 + r3], m4 %endif %if x < %2/4 lea r2, [r2 + 2 * r3] %endif %assign x x+1 %endrep RET %endmacro FILTER_V4_W2_H4_sse2 pp, 4 FILTER_V4_W2_H4_sse2 pp, 8 FILTER_V4_W2_H4_sse2 pp, 16 FILTER_V4_W2_H4_sse2 ps, 4 FILTER_V4_W2_H4_sse2 ps, 8 FILTER_V4_W2_H4_sse2 ps, 16 ;----------------------------------------------------------------------------- ; void interp_4tap_vert_%1_4x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;----------------------------------------------------------------------------- %macro FILTER_V2_W4_H4_sse2 1 INIT_XMM sse2 cglobal interp_4tap_vert_%1_4x2, 4, 6, 8 mov r4d, r4m sub r0, r1 pxor m7, m7 %ifdef PIC lea r5, [tabw_ChromaCoeff] movh m0, [r5 + r4 * 8] %else movh m0, [tabw_ChromaCoeff + r4 * 8] %endif lea r5, [r0 + 2 * r1] punpcklqdq m0, m0 movd m2, [r0] movd m3, [r0 + r1] movd m4, [r5] movd m5, [r5 + r1] punpcklbw m2, m3 punpcklbw m1, m4, m5 punpcklwd m2, m1 movhlps m6, m2 punpcklbw m2, m7 punpcklbw m6, m7 pmaddwd m2, m0 pmaddwd m6, m0 packssdw m2, m6 movd m1, [r0 + 4 * r1] punpcklbw m3, m4 punpcklbw m5, m1 punpcklwd m3, m5 movhlps m6, m3 punpcklbw m3, m7 punpcklbw m6, m7 pmaddwd m3, m0 pmaddwd m6, m0 packssdw m3, m6 pshuflw m4, m2, q2301 pshufhw m4, m4, q2301 paddw m2, m4 pshuflw m5, m3, q2301 pshufhw m5, m5, q2301 paddw m3, m5 %ifidn %1, pp psrld m2, 16 psrld m3, 16 packssdw m2, m3 paddw m2, [pw_32] psraw m2, 6 packuswb m2, m2 movd [r2], m2 psrldq m2, 4 movd [r2 + r3], m2 %elifidn %1, ps psrldq m2, 2 psrldq m3, 2 pshufd m2, m2, q3120 pshufd m3, m3, q3120 punpcklqdq m2, m3 add r3d, r3d psubw m2, [pw_2000] movh [r2], m2 movhps [r2 + r3], m2 %endif RET %endmacro FILTER_V2_W4_H4_sse2 pp FILTER_V2_W4_H4_sse2 ps ;----------------------------------------------------------------------------- ; void interp_4tap_vert_%1_4x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;----------------------------------------------------------------------------- %macro FILTER_V4_W4_H4_sse2 2 INIT_XMM sse2 %if ARCH_X86_64 cglobal interp_4tap_vert_%1_4x%2, 4, 6, 9 pxor m8, m8 %else cglobal interp_4tap_vert_%1_4x%2, 4, 6, 8 %endif mov r4d, r4m sub r0, r1 %ifdef PIC lea r5, [tabw_ChromaCoeff] movh m0, [r5 + r4 * 8] %else movh m0, [tabw_ChromaCoeff + r4 * 8] %endif %ifidn %1,pp mova m1, [pw_32] %elifidn %1,ps add r3d, r3d mova m1, [pw_2000] %endif lea r5, [3 * r1] lea r4, [3 * r3] punpcklqdq m0, m0 %assign x 1 %rep %2/4 movd m2, [r0] movd m3, [r0 + r1] movd m4, [r0 + 2 * r1] movd m5, [r0 + r5] punpcklbw m2, m3 punpcklbw m6, m4, m5 punpcklwd m2, m6 movhlps m6, m2 WORD_TO_DOUBLE m2 WORD_TO_DOUBLE m6 pmaddwd m2, m0 pmaddwd m6, m0 packssdw m2, m6 lea r0, [r0 + 4 * r1] movd m6, [r0] punpcklbw m3, m4 punpcklbw m7, m5, m6 punpcklwd m3, m7 movhlps m7, m3 WORD_TO_DOUBLE m3 WORD_TO_DOUBLE m7 pmaddwd m3, m0 pmaddwd m7, m0 packssdw m3, m7 pshuflw m7, m2, q2301 pshufhw m7, m7, q2301 paddw m2, m7 pshuflw m7, m3, q2301 pshufhw m7, m7, q2301 paddw m3, m7 %ifidn %1,pp psrld m2, 16 psrld m3, 16 packssdw m2, m3 paddw m2, m1 psraw m2, 6 %elifidn 
%1,ps psrldq m2, 2 psrldq m3, 2 pshufd m2, m2, q3120 pshufd m3, m3, q3120 punpcklqdq m2, m3 psubw m2, m1 movh [r2], m2 movhps [r2 + r3], m2 %endif movd m7, [r0 + r1] punpcklbw m4, m5 punpcklbw m3, m6, m7 punpcklwd m4, m3 movhlps m3, m4 WORD_TO_DOUBLE m4 WORD_TO_DOUBLE m3 pmaddwd m4, m0 pmaddwd m3, m0 packssdw m4, m3 movd m3, [r0 + 2 * r1] punpcklbw m5, m6 punpcklbw m7, m3 punpcklwd m5, m7 movhlps m3, m5 WORD_TO_DOUBLE m5 WORD_TO_DOUBLE m3 pmaddwd m5, m0 pmaddwd m3, m0 packssdw m5, m3 pshuflw m7, m4, q2301 pshufhw m7, m7, q2301 paddw m4, m7 pshuflw m7, m5, q2301 pshufhw m7, m7, q2301 paddw m5, m7 %ifidn %1,pp psrld m4, 16 psrld m5, 16 packssdw m4, m5 paddw m4, m1 psraw m4, 6 packuswb m2, m4 movd [r2], m2 psrldq m2, 4 movd [r2 + r3], m2 psrldq m2, 4 movd [r2 + 2 * r3], m2 psrldq m2, 4 movd [r2 + r4], m2 %elifidn %1,ps psrldq m4, 2 psrldq m5, 2 pshufd m4, m4, q3120 pshufd m5, m5, q3120 punpcklqdq m4, m5 psubw m4, m1 movh [r2 + 2 * r3], m4 movhps [r2 + r4], m4 %endif %if x < %2/4 lea r2, [r2 + 4 * r3] %endif %assign x x+1 %endrep RET %endmacro FILTER_V4_W4_H4_sse2 pp, 4 FILTER_V4_W4_H4_sse2 pp, 8 FILTER_V4_W4_H4_sse2 pp, 16 FILTER_V4_W4_H4_sse2 pp, 32 FILTER_V4_W4_H4_sse2 ps, 4 FILTER_V4_W4_H4_sse2 ps, 8 FILTER_V4_W4_H4_sse2 ps, 16 FILTER_V4_W4_H4_sse2 ps, 32 ;----------------------------------------------------------------------------- ;void interp_4tap_vert_%1_6x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;----------------------------------------------------------------------------- %macro FILTER_V4_W6_H4_sse2 2 INIT_XMM sse2 cglobal interp_4tap_vert_%1_6x%2, 4, 7, 10 mov r4d, r4m sub r0, r1 shl r4d, 5 pxor m9, m9 %ifdef PIC lea r5, [tab_ChromaCoeffV] mova m6, [r5 + r4] mova m5, [r5 + r4 + 16] %else mova m6, [tab_ChromaCoeffV + r4] mova m5, [tab_ChromaCoeffV + r4 + 16] %endif %ifidn %1,pp mova m4, [pw_32] %elifidn %1,ps mova m4, [pw_2000] add r3d, r3d %endif lea r5, [3 * r1] %assign x 1 %rep %2/4 movq m0, [r0] movq m1, [r0 + r1] movq m2, [r0 + 2 * r1] movq m3, [r0 + r5] punpcklbw m0, m1 punpcklbw m1, m2 punpcklbw m2, m3 movhlps m7, m0 punpcklbw m0, m9 punpcklbw m7, m9 pmaddwd m0, m6 pmaddwd m7, m6 packssdw m0, m7 movhlps m8, m2 movq m7, m2 punpcklbw m8, m9 punpcklbw m7, m9 pmaddwd m8, m5 pmaddwd m7, m5 packssdw m7, m8 paddw m0, m7 %ifidn %1,pp paddw m0, m4 psraw m0, 6 packuswb m0, m0 movd [r2], m0 pextrw r6d, m0, 2 mov [r2 + 4], r6w %elifidn %1,ps psubw m0, m4 movh [r2], m0 pshufd m0, m0, 2 movd [r2 + 8], m0 %endif lea r0, [r0 + 4 * r1] movq m0, [r0] punpcklbw m3, m0 movhlps m8, m1 punpcklbw m1, m9 punpcklbw m8, m9 pmaddwd m1, m6 pmaddwd m8, m6 packssdw m1, m8 movhlps m8, m3 movq m7, m3 punpcklbw m8, m9 punpcklbw m7, m9 pmaddwd m8, m5 pmaddwd m7, m5 packssdw m7, m8 paddw m1, m7 %ifidn %1,pp paddw m1, m4 psraw m1, 6 packuswb m1, m1 movd [r2 + r3], m1 pextrw r6d, m1, 2 mov [r2 + r3 + 4], r6w %elifidn %1,ps psubw m1, m4 movh [r2 + r3], m1 pshufd m1, m1, 2 movd [r2 + r3 + 8], m1 %endif movq m1, [r0 + r1] punpcklbw m7, m0, m1 movhlps m8, m2 punpcklbw m2, m9 punpcklbw m8, m9 pmaddwd m2, m6 pmaddwd m8, m6 packssdw m2, m8 movhlps m8, m7 punpcklbw m7, m9 punpcklbw m8, m9 pmaddwd m7, m5 pmaddwd m8, m5 packssdw m7, m8 paddw m2, m7 lea r2, [r2 + 2 * r3] %ifidn %1,pp paddw m2, m4 psraw m2, 6 packuswb m2, m2 movd [r2], m2 pextrw r6d, m2, 2 mov [r2 + 4], r6w %elifidn %1,ps psubw m2, m4 movh [r2], m2 pshufd m2, m2, 2 movd [r2 + 8], m2 %endif movq m2, [r0 + 2 * r1] punpcklbw m1, m2 movhlps m8, m3 punpcklbw m3, m9 punpcklbw m8, m9 pmaddwd m3, m6 pmaddwd m8, m6 packssdw m3, 
m8 movhlps m8, m1 punpcklbw m1, m9 punpcklbw m8, m9 pmaddwd m1, m5 pmaddwd m8, m5 packssdw m1, m8 paddw m3, m1 %ifidn %1,pp paddw m3, m4 psraw m3, 6 packuswb m3, m3 movd [r2 + r3], m3 pextrw r6d, m3, 2 mov [r2 + r3 + 4], r6w %elifidn %1,ps psubw m3, m4 movh [r2 + r3], m3 pshufd m3, m3, 2 movd [r2 + r3 + 8], m3 %endif %if x < %2/4 lea r2, [r2 + 2 * r3] %endif %assign x x+1 %endrep RET %endmacro %if ARCH_X86_64 FILTER_V4_W6_H4_sse2 pp, 8 FILTER_V4_W6_H4_sse2 pp, 16 FILTER_V4_W6_H4_sse2 ps, 8 FILTER_V4_W6_H4_sse2 ps, 16 %endif ;----------------------------------------------------------------------------- ; void interp_4tap_vert_%1_8x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;----------------------------------------------------------------------------- %macro FILTER_V4_W8_sse2 2 INIT_XMM sse2 cglobal interp_4tap_vert_%1_8x%2, 4, 7, 12 mov r4d, r4m sub r0, r1 shl r4d, 5 pxor m9, m9 %ifidn %1,pp mova m4, [pw_32] %elifidn %1,ps mova m4, [pw_2000] add r3d, r3d %endif %ifdef PIC lea r6, [tab_ChromaCoeffV] mova m6, [r6 + r4] mova m5, [r6 + r4 + 16] %else mova m6, [tab_ChromaCoeffV + r4] mova m5, [tab_ChromaCoeffV + r4 + 16] %endif movq m0, [r0] movq m1, [r0 + r1] movq m2, [r0 + 2 * r1] lea r5, [r0 + 2 * r1] movq m3, [r5 + r1] punpcklbw m0, m1 punpcklbw m7, m2, m3 movhlps m8, m0 punpcklbw m0, m9 punpcklbw m8, m9 pmaddwd m0, m6 pmaddwd m8, m6 packssdw m0, m8 movhlps m8, m7 punpcklbw m7, m9 punpcklbw m8, m9 pmaddwd m7, m5 pmaddwd m8, m5 packssdw m7, m8 paddw m0, m7 %ifidn %1,pp paddw m0, m4 psraw m0, 6 %elifidn %1,ps psubw m0, m4 movu [r2], m0 %endif movq m11, [r0 + 4 * r1] punpcklbw m1, m2 punpcklbw m7, m3, m11 movhlps m8, m1 punpcklbw m1, m9 punpcklbw m8, m9 pmaddwd m1, m6 pmaddwd m8, m6 packssdw m1, m8 movhlps m8, m7 punpcklbw m7, m9 punpcklbw m8, m9 pmaddwd m7, m5 pmaddwd m8, m5 packssdw m7, m8 paddw m1, m7 %ifidn %1,pp paddw m1, m4 psraw m1, 6 packuswb m1, m0 movhps [r2], m1 movh [r2 + r3], m1 %elifidn %1,ps psubw m1, m4 movu [r2 + r3], m1 %endif %if %2 == 2 ;end of 8x2 RET %else lea r6, [r0 + 4 * r1] movq m1, [r6 + r1] punpcklbw m2, m3 punpcklbw m7, m11, m1 movhlps m8, m2 punpcklbw m2, m9 punpcklbw m8, m9 pmaddwd m2, m6 pmaddwd m8, m6 packssdw m2, m8 movhlps m8, m7 punpcklbw m7, m9 punpcklbw m8, m9 pmaddwd m7, m5 pmaddwd m8, m5 packssdw m7, m8 paddw m2, m7 %ifidn %1,pp paddw m2, m4 psraw m2, 6 %elifidn %1,ps psubw m2, m4 movu [r2 + 2 * r3], m2 %endif movq m10, [r6 + 2 * r1] punpcklbw m3, m11 punpcklbw m7, m1, m10 movhlps m8, m3 punpcklbw m3, m9 punpcklbw m8, m9 pmaddwd m3, m6 pmaddwd m8, m6 packssdw m3, m8 movhlps m8, m7 punpcklbw m7, m9 punpcklbw m8, m9 pmaddwd m7, m5 pmaddwd m8, m5 packssdw m7, m8 paddw m3, m7 lea r5, [r2 + 2 * r3] %ifidn %1,pp paddw m3, m4 psraw m3, 6 packuswb m3, m2 movhps [r2 + 2 * r3], m3 movh [r5 + r3], m3 %elifidn %1,ps psubw m3, m4 movu [r5 + r3], m3 %endif %if %2 == 4 ;end of 8x4 RET %else lea r6, [r6 + 2 * r1] movq m3, [r6 + r1] punpcklbw m11, m1 punpcklbw m7, m10, m3 movhlps m8, m11 punpcklbw m11, m9 punpcklbw m8, m9 pmaddwd m11, m6 pmaddwd m8, m6 packssdw m11, m8 movhlps m8, m7 punpcklbw m7, m9 punpcklbw m8, m9 pmaddwd m7, m5 pmaddwd m8, m5 packssdw m7, m8 paddw m11, m7 %ifidn %1, pp paddw m11, m4 psraw m11, 6 %elifidn %1,ps psubw m11, m4 movu [r2 + 4 * r3], m11 %endif movq m7, [r0 + 8 * r1] punpcklbw m1, m10 punpcklbw m3, m7 movhlps m8, m1 punpcklbw m1, m9 punpcklbw m8, m9 pmaddwd m1, m6 pmaddwd m8, m6 packssdw m1, m8 movhlps m8, m3 punpcklbw m3, m9 punpcklbw m8, m9 pmaddwd m3, m5 pmaddwd m8, m5 packssdw m3, m8 paddw m1, m3 lea 
r5, [r2 + 4 * r3] %ifidn %1,pp paddw m1, m4 psraw m1, 6 packuswb m1, m11 movhps [r2 + 4 * r3], m1 movh [r5 + r3], m1 %elifidn %1,ps psubw m1, m4 movu [r5 + r3], m1 %endif %if %2 == 6 RET %else %error INVALID macro argument, only 2, 4 or 6! %endif %endif %endif %endmacro %if ARCH_X86_64 FILTER_V4_W8_sse2 pp, 2 FILTER_V4_W8_sse2 pp, 4 FILTER_V4_W8_sse2 pp, 6 FILTER_V4_W8_sse2 ps, 2 FILTER_V4_W8_sse2 ps, 4 FILTER_V4_W8_sse2 ps, 6 %endif ;----------------------------------------------------------------------------- ; void interp_4tap_vert_%1_8x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;----------------------------------------------------------------------------- %macro FILTER_V4_W8_H8_H16_H32_sse2 2 INIT_XMM sse2 cglobal interp_4tap_vert_%1_8x%2, 4, 6, 11 mov r4d, r4m sub r0, r1 shl r4d, 5 pxor m9, m9 %ifdef PIC lea r5, [tab_ChromaCoeffV] mova m6, [r5 + r4] mova m5, [r5 + r4 + 16] %else mova m6, [tab_ChromaCoeffV + r4] mova m5, [tab_ChromaCoeffV + r4 + 16] %endif %ifidn %1,pp mova m4, [pw_32] %elifidn %1,ps mova m4, [pw_2000] add r3d, r3d %endif lea r5, [r1 * 3] %assign x 1 %rep %2/4 movq m0, [r0] movq m1, [r0 + r1] movq m2, [r0 + 2 * r1] movq m3, [r0 + r5] punpcklbw m0, m1 punpcklbw m1, m2 punpcklbw m2, m3 movhlps m7, m0 punpcklbw m0, m9 punpcklbw m7, m9 pmaddwd m0, m6 pmaddwd m7, m6 packssdw m0, m7 movhlps m8, m2 movq m7, m2 punpcklbw m8, m9 punpcklbw m7, m9 pmaddwd m8, m5 pmaddwd m7, m5 packssdw m7, m8 paddw m0, m7 %ifidn %1,pp paddw m0, m4 psraw m0, 6 %elifidn %1,ps psubw m0, m4 movu [r2], m0 %endif lea r0, [r0 + 4 * r1] movq m10, [r0] punpcklbw m3, m10 movhlps m8, m1 punpcklbw m1, m9 punpcklbw m8, m9 pmaddwd m1, m6 pmaddwd m8, m6 packssdw m1, m8 movhlps m8, m3 movq m7, m3 punpcklbw m8, m9 punpcklbw m7, m9 pmaddwd m8, m5 pmaddwd m7, m5 packssdw m7, m8 paddw m1, m7 %ifidn %1,pp paddw m1, m4 psraw m1, 6 packuswb m0, m1 movh [r2], m0 movhps [r2 + r3], m0 %elifidn %1,ps psubw m1, m4 movu [r2 + r3], m1 %endif movq m1, [r0 + r1] punpcklbw m10, m1 movhlps m8, m2 punpcklbw m2, m9 punpcklbw m8, m9 pmaddwd m2, m6 pmaddwd m8, m6 packssdw m2, m8 movhlps m8, m10 punpcklbw m10, m9 punpcklbw m8, m9 pmaddwd m10, m5 pmaddwd m8, m5 packssdw m10, m8 paddw m2, m10 lea r2, [r2 + 2 * r3] %ifidn %1,pp paddw m2, m4 psraw m2, 6 %elifidn %1,ps psubw m2, m4 movu [r2], m2 %endif movq m7, [r0 + 2 * r1] punpcklbw m1, m7 movhlps m8, m3 punpcklbw m3, m9 punpcklbw m8, m9 pmaddwd m3, m6 pmaddwd m8, m6 packssdw m3, m8 movhlps m8, m1 punpcklbw m1, m9 punpcklbw m8, m9 pmaddwd m1, m5 pmaddwd m8, m5 packssdw m1, m8 paddw m3, m1 %ifidn %1,pp paddw m3, m4 psraw m3, 6 packuswb m2, m3 movh [r2], m2 movhps [r2 + r3], m2 %elifidn %1,ps psubw m3, m4 movu [r2 + r3], m3 %endif %if x < %2/4 lea r2, [r2 + 2 * r3] %endif %endrep RET %endmacro %if ARCH_X86_64 FILTER_V4_W8_H8_H16_H32_sse2 pp, 8 FILTER_V4_W8_H8_H16_H32_sse2 pp, 16 FILTER_V4_W8_H8_H16_H32_sse2 pp, 32 FILTER_V4_W8_H8_H16_H32_sse2 pp, 12 FILTER_V4_W8_H8_H16_H32_sse2 pp, 64 FILTER_V4_W8_H8_H16_H32_sse2 ps, 8 FILTER_V4_W8_H8_H16_H32_sse2 ps, 16 FILTER_V4_W8_H8_H16_H32_sse2 ps, 32 FILTER_V4_W8_H8_H16_H32_sse2 ps, 12 FILTER_V4_W8_H8_H16_H32_sse2 ps, 64 %endif ;----------------------------------------------------------------------------- ; void interp_4tap_vert_%1_12x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;----------------------------------------------------------------------------- %macro FILTER_V4_W12_H2_sse2 2 INIT_XMM sse2 cglobal interp_4tap_vert_%1_12x%2, 4, 6, 11 mov r4d, r4m sub r0, r1 shl r4d, 5
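; coeffIdx was scaled by 32 above (shl r4d, 5) because each tab_ChromaCoeffV
; entry spans two 16-byte rows: m1 receives the first tap pair and m0 the
; second, feeding the word-domain pmaddwd stages below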
pxor m9, m9 %ifidn %1,pp mova m6, [pw_32] %elifidn %1,ps mova m6, [pw_2000] add r3d, r3d %endif %ifdef PIC lea r5, [tab_ChromaCoeffV] mova m1, [r5 + r4] mova m0, [r5 + r4 + 16] %else mova m1, [tab_ChromaCoeffV + r4] mova m0, [tab_ChromaCoeffV + r4 + 16] %endif %assign x 1 %rep %2/2 movu m2, [r0] movu m3, [r0 + r1] punpcklbw m4, m2, m3 punpckhbw m2, m3 movhlps m8, m4 punpcklbw m4, m9 punpcklbw m8, m9 pmaddwd m4, m1 pmaddwd m8, m1 packssdw m4, m8 movhlps m8, m2 punpcklbw m2, m9 punpcklbw m8, m9 pmaddwd m2, m1 pmaddwd m8, m1 packssdw m2, m8 lea r0, [r0 + 2 * r1] movu m5, [r0] movu m7, [r0 + r1] punpcklbw m10, m5, m7 movhlps m8, m10 punpcklbw m10, m9 punpcklbw m8, m9 pmaddwd m10, m0 pmaddwd m8, m0 packssdw m10, m8 paddw m4, m10 punpckhbw m10, m5, m7 movhlps m8, m10 punpcklbw m10, m9 punpcklbw m8, m9 pmaddwd m10, m0 pmaddwd m8, m0 packssdw m10, m8 paddw m2, m10 %ifidn %1,pp paddw m4, m6 psraw m4, 6 paddw m2, m6 psraw m2, 6 packuswb m4, m2 movh [r2], m4 psrldq m4, 8 movd [r2 + 8], m4 %elifidn %1,ps psubw m4, m6 psubw m2, m6 movu [r2], m4 movh [r2 + 16], m2 %endif punpcklbw m4, m3, m5 punpckhbw m3, m5 movhlps m8, m4 punpcklbw m4, m9 punpcklbw m8, m9 pmaddwd m4, m1 pmaddwd m8, m1 packssdw m4, m8 movhlps m8, m3 punpcklbw m3, m9 punpcklbw m8, m9 pmaddwd m3, m1 pmaddwd m8, m1 packssdw m3, m8 movu m5, [r0 + 2 * r1] punpcklbw m2, m7, m5 punpckhbw m7, m5 movhlps m8, m2 punpcklbw m2, m9 punpcklbw m8, m9 pmaddwd m2, m0 pmaddwd m8, m0 packssdw m2, m8 movhlps m8, m7 punpcklbw m7, m9 punpcklbw m8, m9 pmaddwd m7, m0 pmaddwd m8, m0 packssdw m7, m8 paddw m4, m2 paddw m3, m7 %ifidn %1,pp paddw m4, m6 psraw m4, 6 paddw m3, m6 psraw m3, 6 packuswb m4, m3 movh [r2 + r3], m4 psrldq m4, 8 movd [r2 + r3 + 8], m4 %elifidn %1,ps psubw m4, m6 psubw m3, m6 movu [r2 + r3], m4 movh [r2 + r3 + 16], m3 %endif %if x < %2/2 lea r2, [r2 + 2 * r3] %endif %assign x x+1 %endrep RET %endmacro %if ARCH_X86_64 FILTER_V4_W12_H2_sse2 pp, 16 FILTER_V4_W12_H2_sse2 pp, 32 FILTER_V4_W12_H2_sse2 ps, 16 FILTER_V4_W12_H2_sse2 ps, 32 %endif ;----------------------------------------------------------------------------- ; void interp_4tap_vert_%1_16x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;----------------------------------------------------------------------------- %macro FILTER_V4_W16_H2_sse2 2 INIT_XMM sse2 cglobal interp_4tap_vert_%1_16x%2, 4, 6, 11 mov r4d, r4m sub r0, r1 shl r4d, 5 pxor m9, m9 %ifidn %1,pp mova m6, [pw_32] %elifidn %1,ps mova m6, [pw_2000] add r3d, r3d %endif %ifdef PIC lea r5, [tab_ChromaCoeffV] mova m1, [r5 + r4] mova m0, [r5 + r4 + 16] %else mova m1, [tab_ChromaCoeffV + r4] mova m0, [tab_ChromaCoeffV + r4 + 16] %endif %assign x 1 %rep %2/2 movu m2, [r0] movu m3, [r0 + r1] punpcklbw m4, m2, m3 punpckhbw m2, m3 movhlps m8, m4 punpcklbw m4, m9 punpcklbw m8, m9 pmaddwd m4, m1 pmaddwd m8, m1 packssdw m4, m8 movhlps m8, m2 punpcklbw m2, m9 punpcklbw m8, m9 pmaddwd m2, m1 pmaddwd m8, m1 packssdw m2, m8 lea r0, [r0 + 2 * r1] movu m5, [r0] movu m10, [r0 + r1] punpckhbw m7, m5, m10 movhlps m8, m7 punpcklbw m7, m9 punpcklbw m8, m9 pmaddwd m7, m0 pmaddwd m8, m0 packssdw m7, m8 paddw m2, m7 punpcklbw m7, m5, m10 movhlps m8, m7 punpcklbw m7, m9 punpcklbw m8, m9 pmaddwd m7, m0 pmaddwd m8, m0 packssdw m7, m8 paddw m4, m7 %ifidn %1,pp paddw m4, m6 psraw m4, 6 paddw m2, m6 psraw m2, 6 packuswb m4, m2 movu [r2], m4 %elifidn %1,ps psubw m4, m6 psubw m2, m6 movu [r2], m4 movu [r2 + 16], m2 %endif punpcklbw m4, m3, m5 punpckhbw m3, m5 movhlps m8, m4 punpcklbw m4, m9 punpcklbw m8, m9 pmaddwd m4, m1
pmaddwd m8, m1 packssdw m4, m8 movhlps m8, m3 punpcklbw m3, m9 punpcklbw m8, m9 pmaddwd m3, m1 pmaddwd m8, m1 packssdw m3, m8 movu m5, [r0 + 2 * r1] punpcklbw m2, m10, m5 punpckhbw m10, m5 movhlps m8, m2 punpcklbw m2, m9 punpcklbw m8, m9 pmaddwd m2, m0 pmaddwd m8, m0 packssdw m2, m8 movhlps m8, m10 punpcklbw m10, m9 punpcklbw m8, m9 pmaddwd m10, m0 pmaddwd m8, m0 packssdw m10, m8 paddw m4, m2 paddw m3, m10 %ifidn %1,pp paddw m4, m6 psraw m4, 6 paddw m3, m6 psraw m3, 6 packuswb m4, m3 movu [r2 + r3], m4 %elifidn %1,ps psubw m4, m6 psubw m3, m6 movu [r2 + r3], m4 movu [r2 + r3 + 16], m3 %endif %if x < %2/2 lea r2, [r2 + 2 * r3] %endif %assign x x+1 %endrep RET %endmacro %if ARCH_X86_64 FILTER_V4_W16_H2_sse2 pp, 4 FILTER_V4_W16_H2_sse2 pp, 8 FILTER_V4_W16_H2_sse2 pp, 12 FILTER_V4_W16_H2_sse2 pp, 16 FILTER_V4_W16_H2_sse2 pp, 32 FILTER_V4_W16_H2_sse2 pp, 24 FILTER_V4_W16_H2_sse2 pp, 64 FILTER_V4_W16_H2_sse2 ps, 4 FILTER_V4_W16_H2_sse2 ps, 8 FILTER_V4_W16_H2_sse2 ps, 12 FILTER_V4_W16_H2_sse2 ps, 16 FILTER_V4_W16_H2_sse2 ps, 32 FILTER_V4_W16_H2_sse2 ps, 24 FILTER_V4_W16_H2_sse2 ps, 64 %endif ;----------------------------------------------------------------------------- ;void interp_4tap_vert_%1_24%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;----------------------------------------------------------------------------- %macro FILTER_V4_W24_sse2 2 INIT_XMM sse2 cglobal interp_4tap_vert_%1_24x%2, 4, 6, 11 mov r4d, r4m sub r0, r1 shl r4d, 5 pxor m9, m9 %ifidn %1,pp mova m6, [pw_32] %elifidn %1,ps mova m6, [pw_2000] add r3d, r3d %endif %ifdef PIC lea r5, [tab_ChromaCoeffV] mova m1, [r5 + r4] mova m0, [r5 + r4 + 16] %else mova m1, [tab_ChromaCoeffV + r4] mova m0, [tab_ChromaCoeffV + r4 + 16] %endif %assign x 1 %rep %2/2 movu m2, [r0] movu m3, [r0 + r1] punpcklbw m4, m2, m3 punpckhbw m2, m3 movhlps m8, m4 punpcklbw m4, m9 punpcklbw m8, m9 pmaddwd m4, m1 pmaddwd m8, m1 packssdw m4, m8 movhlps m8, m2 punpcklbw m2, m9 punpcklbw m8, m9 pmaddwd m2, m1 pmaddwd m8, m1 packssdw m2, m8 lea r5, [r0 + 2 * r1] movu m5, [r5] movu m10, [r5 + r1] punpcklbw m7, m5, m10 movhlps m8, m7 punpcklbw m7, m9 punpcklbw m8, m9 pmaddwd m7, m0 pmaddwd m8, m0 packssdw m7, m8 paddw m4, m7 punpckhbw m7, m5, m10 movhlps m8, m7 punpcklbw m7, m9 punpcklbw m8, m9 pmaddwd m7, m0 pmaddwd m8, m0 packssdw m7, m8 paddw m2, m7 %ifidn %1,pp paddw m4, m6 psraw m4, 6 paddw m2, m6 psraw m2, 6 packuswb m4, m2 movu [r2], m4 %elifidn %1,ps psubw m4, m6 psubw m2, m6 movu [r2], m4 movu [r2 + 16], m2 %endif punpcklbw m4, m3, m5 punpckhbw m3, m5 movhlps m8, m4 punpcklbw m4, m9 punpcklbw m8, m9 pmaddwd m4, m1 pmaddwd m8, m1 packssdw m4, m8 movhlps m8, m3 punpcklbw m3, m9 punpcklbw m8, m9 pmaddwd m3, m1 pmaddwd m8, m1 packssdw m3, m8 movu m2, [r5 + 2 * r1] punpcklbw m5, m10, m2 punpckhbw m10, m2 movhlps m8, m5 punpcklbw m5, m9 punpcklbw m8, m9 pmaddwd m5, m0 pmaddwd m8, m0 packssdw m5, m8 movhlps m8, m10 punpcklbw m10, m9 punpcklbw m8, m9 pmaddwd m10, m0 pmaddwd m8, m0 packssdw m10, m8 paddw m4, m5 paddw m3, m10 %ifidn %1,pp paddw m4, m6 psraw m4, 6 paddw m3, m6 psraw m3, 6 packuswb m4, m3 movu [r2 + r3], m4 %elifidn %1,ps psubw m4, m6 psubw m3, m6 movu [r2 + r3], m4 movu [r2 + r3 + 16], m3 %endif movq m2, [r0 + 16] movq m3, [r0 + r1 + 16] movq m4, [r5 + 16] movq m5, [r5 + r1 + 16] punpcklbw m2, m3 punpcklbw m4, m5 movhlps m8, m4 punpcklbw m4, m9 punpcklbw m8, m9 pmaddwd m4, m0 pmaddwd m8, m0 packssdw m4, m8 movhlps m8, m2 punpcklbw m2, m9 punpcklbw m8, m9 pmaddwd m2, m1 pmaddwd m8, m1 packssdw m2, m8 paddw m2, m4 %ifidn 
%1,pp paddw m2, m6 psraw m2, 6 %elifidn %1,ps psubw m2, m6 movu [r2 + 32], m2 %endif movq m3, [r0 + r1 + 16] movq m4, [r5 + 16] movq m5, [r5 + r1 + 16] movq m7, [r5 + 2 * r1 + 16] punpcklbw m3, m4 punpcklbw m5, m7 movhlps m8, m5 punpcklbw m5, m9 punpcklbw m8, m9 pmaddwd m5, m0 pmaddwd m8, m0 packssdw m5, m8 movhlps m8, m3 punpcklbw m3, m9 punpcklbw m8, m9 pmaddwd m3, m1 pmaddwd m8, m1 packssdw m3, m8 paddw m3, m5 %ifidn %1,pp paddw m3, m6 psraw m3, 6 packuswb m2, m3 movh [r2 + 16], m2 movhps [r2 + r3 + 16], m2 %elifidn %1,ps psubw m3, m6 movu [r2 + r3 + 32], m3 %endif %if x < %2/2 mov r0, r5 lea r2, [r2 + 2 * r3] %endif %assign x x+1 %endrep RET %endmacro %if ARCH_X86_64 FILTER_V4_W24_sse2 pp, 32 FILTER_V4_W24_sse2 pp, 64 FILTER_V4_W24_sse2 ps, 32 FILTER_V4_W24_sse2 ps, 64 %endif ;----------------------------------------------------------------------------- ; void interp_4tap_vert_%1_32x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;----------------------------------------------------------------------------- %macro FILTER_V4_W32_sse2 2 INIT_XMM sse2 cglobal interp_4tap_vert_%1_32x%2, 4, 6, 10 mov r4d, r4m sub r0, r1 shl r4d, 5 pxor m9, m9 %ifidn %1,pp mova m6, [pw_32] %elifidn %1,ps mova m6, [pw_2000] add r3d, r3d %endif %ifdef PIC lea r5, [tab_ChromaCoeffV] mova m1, [r5 + r4] mova m0, [r5 + r4 + 16] %else mova m1, [tab_ChromaCoeffV + r4] mova m0, [tab_ChromaCoeffV + r4 + 16] %endif mov r4d, %2 .loop: movu m2, [r0] movu m3, [r0 + r1] punpcklbw m4, m2, m3 punpckhbw m2, m3 movhlps m8, m4 punpcklbw m4, m9 punpcklbw m8, m9 pmaddwd m4, m1 pmaddwd m8, m1 packssdw m4, m8 movhlps m8, m2 punpcklbw m2, m9 punpcklbw m8, m9 pmaddwd m2, m1 pmaddwd m8, m1 packssdw m2, m8 lea r5, [r0 + 2 * r1] movu m3, [r5] movu m5, [r5 + r1] punpcklbw m7, m3, m5 punpckhbw m3, m5 movhlps m8, m7 punpcklbw m7, m9 punpcklbw m8, m9 pmaddwd m7, m0 pmaddwd m8, m0 packssdw m7, m8 movhlps m8, m3 punpcklbw m3, m9 punpcklbw m8, m9 pmaddwd m3, m0 pmaddwd m8, m0 packssdw m3, m8 paddw m4, m7 paddw m2, m3 %ifidn %1,pp paddw m4, m6 psraw m4, 6 paddw m2, m6 psraw m2, 6 packuswb m4, m2 movu [r2], m4 %elifidn %1,ps psubw m4, m6 psubw m2, m6 movu [r2], m4 movu [r2 + 16], m2 %endif movu m2, [r0 + 16] movu m3, [r0 + r1 + 16] punpcklbw m4, m2, m3 punpckhbw m2, m3 movhlps m8, m4 punpcklbw m4, m9 punpcklbw m8, m9 pmaddwd m4, m1 pmaddwd m8, m1 packssdw m4, m8 movhlps m8, m2 punpcklbw m2, m9 punpcklbw m8, m9 pmaddwd m2, m1 pmaddwd m8, m1 packssdw m2, m8 movu m3, [r5 + 16] movu m5, [r5 + r1 + 16] punpcklbw m7, m3, m5 punpckhbw m3, m5 movhlps m8, m7 punpcklbw m7, m9 punpcklbw m8, m9 pmaddwd m7, m0 pmaddwd m8, m0 packssdw m7, m8 movhlps m8, m3 punpcklbw m3, m9 punpcklbw m8, m9 pmaddwd m3, m0 pmaddwd m8, m0 packssdw m3, m8 paddw m4, m7 paddw m2, m3 %ifidn %1,pp paddw m4, m6 psraw m4, 6 paddw m2, m6 psraw m2, 6 packuswb m4, m2 movu [r2 + 16], m4 %elifidn %1,ps psubw m4, m6 psubw m2, m6 movu [r2 + 32], m4 movu [r2 + 48], m2 %endif lea r0, [r0 + r1] lea r2, [r2 + r3] dec r4 jnz .loop RET %endmacro %if ARCH_X86_64 FILTER_V4_W32_sse2 pp, 8 FILTER_V4_W32_sse2 pp, 16 FILTER_V4_W32_sse2 pp, 24 FILTER_V4_W32_sse2 pp, 32 FILTER_V4_W32_sse2 pp, 48 FILTER_V4_W32_sse2 pp, 64 FILTER_V4_W32_sse2 ps, 8 FILTER_V4_W32_sse2 ps, 16 FILTER_V4_W32_sse2 ps, 24 FILTER_V4_W32_sse2 ps, 32 FILTER_V4_W32_sse2 ps, 48 FILTER_V4_W32_sse2 ps, 64 %endif ;----------------------------------------------------------------------------- ; void interp_4tap_vert_%1_%2x%3(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) 
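; %2 is the block width (a multiple of 16, covered 16 columns per .loopW pass)
; and %3 the block height (two rows per .loop iteration)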
;----------------------------------------------------------------------------- %macro FILTER_V4_W16n_H2_sse2 3 INIT_XMM sse2 cglobal interp_4tap_vert_%1_%2x%3, 4, 7, 11 mov r4d, r4m sub r0, r1 shl r4d, 5 pxor m9, m9 %ifidn %1,pp mova m7, [pw_32] %elifidn %1,ps mova m7, [pw_2000] add r3d, r3d %endif %ifdef PIC lea r5, [tab_ChromaCoeffV] mova m1, [r5 + r4] mova m0, [r5 + r4 + 16] %else mova m1, [tab_ChromaCoeffV + r4] mova m0, [tab_ChromaCoeffV + r4 + 16] %endif mov r4d, %3/2 .loop: mov r6d, %2/16 .loopW: movu m2, [r0] movu m3, [r0 + r1] punpcklbw m4, m2, m3 punpckhbw m2, m3 movhlps m8, m4 punpcklbw m4, m9 punpcklbw m8, m9 pmaddwd m4, m1 pmaddwd m8, m1 packssdw m4, m8 movhlps m8, m2 punpcklbw m2, m9 punpcklbw m8, m9 pmaddwd m2, m1 pmaddwd m8, m1 packssdw m2, m8 lea r5, [r0 + 2 * r1] movu m5, [r5] movu m6, [r5 + r1] punpckhbw m10, m5, m6 movhlps m8, m10 punpcklbw m10, m9 punpcklbw m8, m9 pmaddwd m10, m0 pmaddwd m8, m0 packssdw m10, m8 paddw m2, m10 punpcklbw m10, m5, m6 movhlps m8, m10 punpcklbw m10, m9 punpcklbw m8, m9 pmaddwd m10, m0 pmaddwd m8, m0 packssdw m10, m8 paddw m4, m10 %ifidn %1,pp paddw m4, m7 psraw m4, 6 paddw m2, m7 psraw m2, 6 packuswb m4, m2 movu [r2], m4 %elifidn %1,ps psubw m4, m7 psubw m2, m7 movu [r2], m4 movu [r2 + 16], m2 %endif punpcklbw m4, m3, m5 punpckhbw m3, m5 movhlps m8, m4 punpcklbw m4, m9 punpcklbw m8, m9 pmaddwd m4, m1 pmaddwd m8, m1 packssdw m4, m8 movhlps m8, m3 punpcklbw m3, m9 punpcklbw m8, m9 pmaddwd m3, m1 pmaddwd m8, m1 packssdw m3, m8 movu m5, [r5 + 2 * r1] punpcklbw m2, m6, m5 punpckhbw m6, m5 movhlps m8, m2 punpcklbw m2, m9 punpcklbw m8, m9 pmaddwd m2, m0 pmaddwd m8, m0 packssdw m2, m8 movhlps m8, m6 punpcklbw m6, m9 punpcklbw m8, m9 pmaddwd m6, m0 pmaddwd m8, m0 packssdw m6, m8 paddw m4, m2 paddw m3, m6 %ifidn %1,pp paddw m4, m7 psraw m4, 6 paddw m3, m7 psraw m3, 6 packuswb m4, m3 movu [r2 + r3], m4 add r2, 16 %elifidn %1,ps psubw m4, m7 psubw m3, m7 movu [r2 + r3], m4 movu [r2 + r3 + 16], m3 add r2, 32 %endif add r0, 16 dec r6d jnz .loopW lea r0, [r0 + r1 * 2 - %2] %ifidn %1,pp lea r2, [r2 + r3 * 2 - %2] %elifidn %1,ps lea r2, [r2 + r3 * 2 - (%2 * 2)] %endif dec r4d jnz .loop RET %endmacro %if ARCH_X86_64 FILTER_V4_W16n_H2_sse2 pp, 64, 64 FILTER_V4_W16n_H2_sse2 pp, 64, 32 FILTER_V4_W16n_H2_sse2 pp, 64, 48 FILTER_V4_W16n_H2_sse2 pp, 48, 64 FILTER_V4_W16n_H2_sse2 pp, 64, 16 FILTER_V4_W16n_H2_sse2 ps, 64, 64 FILTER_V4_W16n_H2_sse2 ps, 64, 32 FILTER_V4_W16n_H2_sse2 ps, 64, 48 FILTER_V4_W16n_H2_sse2 ps, 48, 64 FILTER_V4_W16n_H2_sse2 ps, 64, 16 %endif %macro FILTER_P2S_2_4_sse2 1 movd m2, [r0 + %1] movd m3, [r0 + r1 + %1] punpcklwd m2, m3 movd m3, [r0 + r1 * 2 + %1] movd m4, [r0 + r4 + %1] punpcklwd m3, m4 punpckldq m2, m3 punpcklbw m2, m0 psllw m2, 6 psubw m2, m1 movd [r2 + r3 * 0 + %1 * 2], m2 psrldq m2, 4 movd [r2 + r3 * 1 + %1 * 2], m2 psrldq m2, 4 movd [r2 + r3 * 2 + %1 * 2], m2 psrldq m2, 4 movd [r2 + r5 + %1 * 2], m2 %endmacro %macro FILTER_P2S_4_4_sse2 1 movd m2, [r0 + %1] movd m3, [r0 + r1 + %1] movd m4, [r0 + r1 * 2 + %1] movd m5, [r0 + r4 + %1] punpckldq m2, m3 punpcklbw m2, m0 punpckldq m4, m5 punpcklbw m4, m0 psllw m2, 6 psllw m4, 6 psubw m2, m1 psubw m4, m1 movh [r2 + r3 * 0 + %1 * 2], m2 movh [r2 + r3 * 2 + %1 * 2], m4 movhps [r2 + r3 * 1 + %1 * 2], m2 movhps [r2 + r5 + %1 * 2], m4 %endmacro %macro FILTER_P2S_4_2_sse2 0 movd m2, [r0] movd m3, [r0 + r1] punpckldq m2, m3 punpcklbw m2, m0 psllw m2, 6 psubw m2, [pw_8192] movh [r2], m2 movhps [r2 + r3 * 2], m2 %endmacro %macro FILTER_P2S_8_4_sse2 1 movh m2, [r0 + %1] movh m3, [r0 + r1 + %1] 
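; rows 2 and 3 of the 8-wide column are fetched next; each of the four rows is
; then zero-extended to words, scaled with psllw 6 and biased with psubw by
; pw_8192, i.e. dst = (src << 6) - 8192, so a mid-grey sample of 128 maps to 0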
movh m4, [r0 + r1 * 2 + %1] movh m5, [r0 + r4 + %1] punpcklbw m2, m0 punpcklbw m3, m0 punpcklbw m5, m0 punpcklbw m4, m0 psllw m2, 6 psllw m3, 6 psllw m5, 6 psllw m4, 6 psubw m2, m1 psubw m3, m1 psubw m4, m1 psubw m5, m1 movu [r2 + r3 * 0 + %1 * 2], m2 movu [r2 + r3 * 1 + %1 * 2], m3 movu [r2 + r3 * 2 + %1 * 2], m4 movu [r2 + r5 + %1 * 2], m5 %endmacro %macro FILTER_P2S_8_2_sse2 1 movh m2, [r0 + %1] movh m3, [r0 + r1 + %1] punpcklbw m2, m0 punpcklbw m3, m0 psllw m2, 6 psllw m3, 6 psubw m2, m1 psubw m3, m1 movu [r2 + r3 * 0 + %1 * 2], m2 movu [r2 + r3 * 1 + %1 * 2], m3 %endmacro ;----------------------------------------------------------------------------- ; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride) ;----------------------------------------------------------------------------- %macro FILTER_PIX_TO_SHORT_sse2 2 INIT_XMM sse2 cglobal filterPixelToShort_%1x%2, 4, 6, 6 pxor m0, m0 %if %2 == 2 %if %1 == 4 FILTER_P2S_4_2_sse2 %elif %1 == 8 add r3d, r3d mova m1, [pw_8192] FILTER_P2S_8_2_sse2 0 %endif %else add r3d, r3d mova m1, [pw_8192] lea r4, [r1 * 3] lea r5, [r3 * 3] %assign y 1 %rep %2/4 %assign x 0 %rep %1/8 FILTER_P2S_8_4_sse2 x %if %2 == 6 lea r0, [r0 + 4 * r1] lea r2, [r2 + 4 * r3] FILTER_P2S_8_2_sse2 x %endif %assign x x+8 %endrep %rep (%1 % 8)/4 FILTER_P2S_4_4_sse2 x %assign x x+4 %endrep %rep (%1 % 4)/2 FILTER_P2S_2_4_sse2 x %endrep %if y < %2/4 lea r0, [r0 + 4 * r1] lea r2, [r2 + 4 * r3] %assign y y+1 %endif %endrep %endif RET %endmacro FILTER_PIX_TO_SHORT_sse2 2, 4 FILTER_PIX_TO_SHORT_sse2 2, 8 FILTER_PIX_TO_SHORT_sse2 2, 16 FILTER_PIX_TO_SHORT_sse2 4, 2 FILTER_PIX_TO_SHORT_sse2 4, 4 FILTER_PIX_TO_SHORT_sse2 4, 8 FILTER_PIX_TO_SHORT_sse2 4, 16 FILTER_PIX_TO_SHORT_sse2 4, 32 FILTER_PIX_TO_SHORT_sse2 6, 8 FILTER_PIX_TO_SHORT_sse2 6, 16 FILTER_PIX_TO_SHORT_sse2 8, 2 FILTER_PIX_TO_SHORT_sse2 8, 4 FILTER_PIX_TO_SHORT_sse2 8, 6 FILTER_PIX_TO_SHORT_sse2 8, 8 FILTER_PIX_TO_SHORT_sse2 8, 12 FILTER_PIX_TO_SHORT_sse2 8, 16 FILTER_PIX_TO_SHORT_sse2 8, 32 FILTER_PIX_TO_SHORT_sse2 8, 64 FILTER_PIX_TO_SHORT_sse2 12, 16 FILTER_PIX_TO_SHORT_sse2 12, 32 FILTER_PIX_TO_SHORT_sse2 16, 4 FILTER_PIX_TO_SHORT_sse2 16, 8 FILTER_PIX_TO_SHORT_sse2 16, 12 FILTER_PIX_TO_SHORT_sse2 16, 16 FILTER_PIX_TO_SHORT_sse2 16, 24 FILTER_PIX_TO_SHORT_sse2 16, 32 FILTER_PIX_TO_SHORT_sse2 16, 64 FILTER_PIX_TO_SHORT_sse2 24, 32 FILTER_PIX_TO_SHORT_sse2 24, 64 FILTER_PIX_TO_SHORT_sse2 32, 8 FILTER_PIX_TO_SHORT_sse2 32, 16 FILTER_PIX_TO_SHORT_sse2 32, 24 FILTER_PIX_TO_SHORT_sse2 32, 32 FILTER_PIX_TO_SHORT_sse2 32, 48 FILTER_PIX_TO_SHORT_sse2 32, 64 FILTER_PIX_TO_SHORT_sse2 48, 64 FILTER_PIX_TO_SHORT_sse2 64, 16 FILTER_PIX_TO_SHORT_sse2 64, 32 FILTER_PIX_TO_SHORT_sse2 64, 48 FILTER_PIX_TO_SHORT_sse2 64, 64 %macro FILTER_H4_w2_2 3 movh %2, [srcq - 1] pshufb %2, %2, Tm0 movh %1, [srcq + srcstrideq - 1] pshufb %1, %1, Tm0 punpcklqdq %2, %1 pmaddubsw %2, coef2 phaddw %2, %2 pmulhrsw %2, %3 packuswb %2, %2 movd r4, %2 mov [dstq], r4w shr r4, 16 mov [dstq + dststrideq], r4w %endmacro ;----------------------------------------------------------------------------- ; void interp_4tap_horiz_pp_2x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal interp_4tap_horiz_pp_2x4, 4, 6, 5, src, srcstride, dst, dststride %define coef2 m4 %define Tm0 m3 %define t2 m2 %define t1 m1 %define t0 m0 mov r4d, r4m %ifdef PIC lea r5, [tab_ChromaCoeff] movd coef2, [r5 + r4 * 4] %else 
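; non-PIC build: tab_ChromaCoeff (four coefficient bytes per filter phase,
; hence the r4 * 4 index) is addressed directly, with no lea through r5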
movd coef2, [tab_ChromaCoeff + r4 * 4] %endif pshufd coef2, coef2, 0 mova t2, [pw_512] mova Tm0, [tab_Tm] %rep 2 FILTER_H4_w2_2 t0, t1, t2 lea srcq, [srcq + srcstrideq * 2] lea dstq, [dstq + dststrideq * 2] %endrep RET ;----------------------------------------------------------------------------- ; void interp_4tap_horiz_pp_2x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal interp_4tap_horiz_pp_2x8, 4, 6, 5, src, srcstride, dst, dststride %define coef2 m4 %define Tm0 m3 %define t2 m2 %define t1 m1 %define t0 m0 mov r4d, r4m %ifdef PIC lea r5, [tab_ChromaCoeff] movd coef2, [r5 + r4 * 4] %else movd coef2, [tab_ChromaCoeff + r4 * 4] %endif pshufd coef2, coef2, 0 mova t2, [pw_512] mova Tm0, [tab_Tm] %rep 4 FILTER_H4_w2_2 t0, t1, t2 lea srcq, [srcq + srcstrideq * 2] lea dstq, [dstq + dststrideq * 2] %endrep RET ;----------------------------------------------------------------------------- ; void interp_4tap_horiz_pp_2x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal interp_4tap_horiz_pp_2x16, 4, 6, 5, src, srcstride, dst, dststride %define coef2 m4 %define Tm0 m3 %define t2 m2 %define t1 m1 %define t0 m0 mov r4d, r4m %ifdef PIC lea r5, [tab_ChromaCoeff] movd coef2, [r5 + r4 * 4] %else movd coef2, [tab_ChromaCoeff + r4 * 4] %endif pshufd coef2, coef2, 0 mova t2, [pw_512] mova Tm0, [tab_Tm] mov r5d, 16/2 .loop: FILTER_H4_w2_2 t0, t1, t2 lea srcq, [srcq + srcstrideq * 2] lea dstq, [dstq + dststrideq * 2] dec r5d jnz .loop RET %macro FILTER_H4_w4_2 3 movh %2, [srcq - 1] pshufb %2, %2, Tm0 pmaddubsw %2, coef2 movh %1, [srcq + srcstrideq - 1] pshufb %1, %1, Tm0 pmaddubsw %1, coef2 phaddw %2, %1 pmulhrsw %2, %3 packuswb %2, %2 movd [dstq], %2 palignr %2, %2, 4 movd [dstq + dststrideq], %2 %endmacro ;----------------------------------------------------------------------------- ; void interp_4tap_horiz_pp_4x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal interp_4tap_horiz_pp_4x2, 4, 6, 5, src, srcstride, dst, dststride %define coef2 m4 %define Tm0 m3 %define t2 m2 %define t1 m1 %define t0 m0 mov r4d, r4m %ifdef PIC lea r5, [tab_ChromaCoeff] movd coef2, [r5 + r4 * 4] %else movd coef2, [tab_ChromaCoeff + r4 * 4] %endif pshufd coef2, coef2, 0 mova t2, [pw_512] mova Tm0, [tab_Tm] FILTER_H4_w4_2 t0, t1, t2 RET ;----------------------------------------------------------------------------- ; void interp_4tap_horiz_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal interp_4tap_horiz_pp_4x4, 4, 6, 5, src, srcstride, dst, dststride %define coef2 m4 %define Tm0 m3 %define t2 m2 %define t1 m1 %define t0 m0 mov r4d, r4m %ifdef PIC lea r5, [tab_ChromaCoeff] movd coef2, [r5 + r4 * 4] %else movd coef2, [tab_ChromaCoeff + r4 * 4] %endif pshufd coef2, coef2, 0 mova t2, [pw_512] mova Tm0, [tab_Tm] %rep 2 FILTER_H4_w4_2 t0, t1, t2 lea srcq, [srcq + srcstrideq * 2] lea dstq, [dstq + dststrideq * 2] %endrep RET ;----------------------------------------------------------------------------- ; void interp_4tap_horiz_pp_4x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) 
;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal interp_4tap_horiz_pp_4x8, 4, 6, 5, src, srcstride, dst, dststride %define coef2 m4 %define Tm0 m3 %define t2 m2 %define t1 m1 %define t0 m0 mov r4d, r4m %ifdef PIC lea r5, [tab_ChromaCoeff] movd coef2, [r5 + r4 * 4] %else movd coef2, [tab_ChromaCoeff + r4 * 4] %endif pshufd coef2, coef2, 0 mova t2, [pw_512] mova Tm0, [tab_Tm] %rep 4 FILTER_H4_w4_2 t0, t1, t2 lea srcq, [srcq + srcstrideq * 2] lea dstq, [dstq + dststrideq * 2] %endrep RET ;----------------------------------------------------------------------------- ; void interp_4tap_horiz_pp_4x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal interp_4tap_horiz_pp_4x16, 4, 6, 5, src, srcstride, dst, dststride %define coef2 m4 %define Tm0 m3 %define t2 m2 %define t1 m1 %define t0 m0 mov r4d, r4m %ifdef PIC lea r5, [tab_ChromaCoeff] movd coef2, [r5 + r4 * 4] %else movd coef2, [tab_ChromaCoeff + r4 * 4] %endif pshufd coef2, coef2, 0 mova t2, [pw_512] mova Tm0, [tab_Tm] %rep 8 FILTER_H4_w4_2 t0, t1, t2 lea srcq, [srcq + srcstrideq * 2] lea dstq, [dstq + dststrideq * 2] %endrep RET ;----------------------------------------------------------------------------- ; void interp_4tap_horiz_pp_4x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal interp_4tap_horiz_pp_4x32, 4, 6, 5, src, srcstride, dst, dststride %define coef2 m4 %define Tm0 m3 %define t2 m2 %define t1 m1 %define t0 m0 mov r4d, r4m %ifdef PIC lea r5, [tab_ChromaCoeff] movd coef2, [r5 + r4 * 4] %else movd coef2, [tab_ChromaCoeff + r4 * 4] %endif pshufd coef2, coef2, 0 mova t2, [pw_512] mova Tm0, [tab_Tm] mov r5d, 32/2 .loop: FILTER_H4_w4_2 t0, t1, t2 lea srcq, [srcq + srcstrideq * 2] lea dstq, [dstq + dststrideq * 2] dec r5d jnz .loop RET ALIGN 32 const interp_4tap_8x8_horiz_shuf, dd 0, 4, 1, 5, 2, 6, 3, 7 %macro FILTER_H4_w6 3 movu %1, [srcq - 1] pshufb %2, %1, Tm0 pmaddubsw %2, coef2 pshufb %1, %1, Tm1 pmaddubsw %1, coef2 phaddw %2, %1 pmulhrsw %2, %3 packuswb %2, %2 movd [dstq], %2 pextrw [dstq + 4], %2, 2 %endmacro %macro FILTER_H4_w8 3 movu %1, [srcq - 1] pshufb %2, %1, Tm0 pmaddubsw %2, coef2 pshufb %1, %1, Tm1 pmaddubsw %1, coef2 phaddw %2, %1 pmulhrsw %2, %3 packuswb %2, %2 movh [dstq], %2 %endmacro %macro FILTER_H4_w12 3 movu %1, [srcq - 1] pshufb %2, %1, Tm0 pmaddubsw %2, coef2 pshufb %1, %1, Tm1 pmaddubsw %1, coef2 phaddw %2, %1 pmulhrsw %2, %3 movu %1, [srcq - 1 + 8] pshufb %1, %1, Tm0 pmaddubsw %1, coef2 phaddw %1, %1 pmulhrsw %1, %3 packuswb %2, %1 movh [dstq], %2 pextrd [dstq + 8], %2, 2 %endmacro %macro FILTER_H4_w16 4 movu %1, [srcq - 1] pshufb %2, %1, Tm0 pmaddubsw %2, coef2 pshufb %1, %1, Tm1 pmaddubsw %1, coef2 phaddw %2, %1 movu %1, [srcq - 1 + 8] pshufb %4, %1, Tm0 pmaddubsw %4, coef2 pshufb %1, %1, Tm1 pmaddubsw %1, coef2 phaddw %4, %1 pmulhrsw %2, %3 pmulhrsw %4, %3 packuswb %2, %4 movu [dstq], %2 %endmacro %macro FILTER_H4_w24 4 movu %1, [srcq - 1] pshufb %2, %1, Tm0 pmaddubsw %2, coef2 pshufb %1, %1, Tm1 pmaddubsw %1, coef2 phaddw %2, %1 movu %1, [srcq - 1 + 8] pshufb %4, %1, Tm0 pmaddubsw %4, coef2 pshufb %1, %1, Tm1 pmaddubsw %1, coef2 phaddw %4, %1 pmulhrsw %2, %3 pmulhrsw %4, %3 packuswb %2, %4 movu [dstq], %2 movu %1, [srcq - 1 + 16] pshufb %2, %1, Tm0 pmaddubsw %2, coef2 pshufb %1, %1, Tm1 pmaddubsw 
%1, coef2 phaddw %2, %1 pmulhrsw %2, %3 packuswb %2, %2 movh [dstq + 16], %2 %endmacro %macro FILTER_H4_w32 4 movu %1, [srcq - 1] pshufb %2, %1, Tm0 pmaddubsw %2, coef2 pshufb %1, %1, Tm1 pmaddubsw %1, coef2 phaddw %2, %1 movu %1, [srcq - 1 + 8] pshufb %4, %1, Tm0 pmaddubsw %4, coef2 pshufb %1, %1, Tm1 pmaddubsw %1, coef2 phaddw %4, %1 pmulhrsw %2, %3 pmulhrsw %4, %3 packuswb %2, %4 movu [dstq], %2 movu %1, [srcq - 1 + 16] pshufb %2, %1, Tm0 pmaddubsw %2, coef2 pshufb %1, %1, Tm1 pmaddubsw %1, coef2 phaddw %2, %1 movu %1, [srcq - 1 + 24] pshufb %4, %1, Tm0 pmaddubsw %4, coef2 pshufb %1, %1, Tm1 pmaddubsw %1, coef2 phaddw %4, %1 pmulhrsw %2, %3 pmulhrsw %4, %3 packuswb %2, %4 movu [dstq + 16], %2 %endmacro %macro FILTER_H4_w16o 5 movu %1, [srcq + %5 - 1] pshufb %2, %1, Tm0 pmaddubsw %2, coef2 pshufb %1, %1, Tm1 pmaddubsw %1, coef2 phaddw %2, %1 movu %1, [srcq + %5 - 1 + 8] pshufb %4, %1, Tm0 pmaddubsw %4, coef2 pshufb %1, %1, Tm1 pmaddubsw %1, coef2 phaddw %4, %1 pmulhrsw %2, %3 pmulhrsw %4, %3 packuswb %2, %4 movu [dstq + %5], %2 %endmacro %macro FILTER_H4_w48 4 FILTER_H4_w16o %1, %2, %3, %4, 0 FILTER_H4_w16o %1, %2, %3, %4, 16 FILTER_H4_w16o %1, %2, %3, %4, 32 %endmacro %macro FILTER_H4_w64 4 FILTER_H4_w16o %1, %2, %3, %4, 0 FILTER_H4_w16o %1, %2, %3, %4, 16 FILTER_H4_w16o %1, %2, %3, %4, 32 FILTER_H4_w16o %1, %2, %3, %4, 48 %endmacro ;----------------------------------------------------------------------------- ; void interp_4tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;----------------------------------------------------------------------------- %macro IPFILTER_CHROMA 2 INIT_XMM sse4 cglobal interp_4tap_horiz_pp_%1x%2, 4, 6, 6, src, srcstride, dst, dststride %define coef2 m5 %define Tm0 m4 %define Tm1 m3 %define t2 m2 %define t1 m1 %define t0 m0 mov r4d, r4m %ifdef PIC lea r5, [tab_ChromaCoeff] movd coef2, [r5 + r4 * 4] %else movd coef2, [tab_ChromaCoeff + r4 * 4] %endif mov r5d, %2 pshufd coef2, coef2, 0 mova t2, [pw_512] mova Tm0, [tab_Tm] mova Tm1, [tab_Tm + 16] .loop: FILTER_H4_w%1 t0, t1, t2 add srcq, srcstrideq add dstq, dststrideq dec r5d jnz .loop RET %endmacro IPFILTER_CHROMA 6, 8 IPFILTER_CHROMA 8, 2 IPFILTER_CHROMA 8, 4 IPFILTER_CHROMA 8, 6 IPFILTER_CHROMA 8, 8 IPFILTER_CHROMA 8, 16 IPFILTER_CHROMA 8, 32 IPFILTER_CHROMA 12, 16 IPFILTER_CHROMA 6, 16 IPFILTER_CHROMA 8, 12 IPFILTER_CHROMA 8, 64 IPFILTER_CHROMA 12, 32 ;----------------------------------------------------------------------------- ; void interp_4tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;----------------------------------------------------------------------------- %macro IPFILTER_CHROMA_W 2 INIT_XMM sse4 cglobal interp_4tap_horiz_pp_%1x%2, 4, 6, 7, src, srcstride, dst, dststride %define coef2 m6 %define Tm0 m5 %define Tm1 m4 %define t3 m3 %define t2 m2 %define t1 m1 %define t0 m0 mov r4d, r4m %ifdef PIC lea r5, [tab_ChromaCoeff] movd coef2, [r5 + r4 * 4] %else movd coef2, [tab_ChromaCoeff + r4 * 4] %endif mov r5d, %2 pshufd coef2, coef2, 0 mova t2, [pw_512] mova Tm0, [tab_Tm] mova Tm1, [tab_Tm + 16] .loop: FILTER_H4_w%1 t0, t1, t2, t3 add srcq, srcstrideq add dstq, dststrideq dec r5d jnz .loop RET %endmacro IPFILTER_CHROMA_W 16, 4 IPFILTER_CHROMA_W 16, 8 IPFILTER_CHROMA_W 16, 12 IPFILTER_CHROMA_W 16, 16 IPFILTER_CHROMA_W 16, 32 IPFILTER_CHROMA_W 32, 8 IPFILTER_CHROMA_W 32, 16 IPFILTER_CHROMA_W 32, 24 IPFILTER_CHROMA_W 24, 32 IPFILTER_CHROMA_W 32, 32 IPFILTER_CHROMA_W 16, 24 IPFILTER_CHROMA_W 16, 64 
IPFILTER_CHROMA_W 32, 48 IPFILTER_CHROMA_W 24, 64 IPFILTER_CHROMA_W 32, 64 IPFILTER_CHROMA_W 64, 64 IPFILTER_CHROMA_W 64, 32 IPFILTER_CHROMA_W 64, 48 IPFILTER_CHROMA_W 48, 64 IPFILTER_CHROMA_W 64, 16 %macro FILTER_H8_W8 7-8 ; t0, t1, t2, t3, coef, c512, src, dst movu %1, %7 pshufb %2, %1, [tab_Lm + 0] pmaddubsw %2, %5 pshufb %3, %1, [tab_Lm + 16] pmaddubsw %3, %5 phaddw %2, %3 pshufb %4, %1, [tab_Lm + 32] pmaddubsw %4, %5 pshufb %1, %1, [tab_Lm + 48] pmaddubsw %1, %5 phaddw %4, %1 phaddw %2, %4 %if %0 == 8 pmulhrsw %2, %6 packuswb %2, %2 movh %8, %2 %endif %endmacro %macro FILTER_H8_W4 2 movu %1, [r0 - 3 + r5] pshufb %2, %1, [tab_Lm] pmaddubsw %2, m3 pshufb m7, %1, [tab_Lm + 16] pmaddubsw m7, m3 phaddw %2, m7 phaddw %2, %2 %endmacro ;---------------------------------------------------------------------------------------------------------------------------- ; void interp_8tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt) ;---------------------------------------------------------------------------------------------------------------------------- %macro IPFILTER_LUMA 3 INIT_XMM sse4 cglobal interp_8tap_horiz_%3_%1x%2, 4,7,8 mov r4d, r4m %ifdef PIC lea r6, [tab_LumaCoeff] movh m3, [r6 + r4 * 8] %else movh m3, [tab_LumaCoeff + r4 * 8] %endif punpcklqdq m3, m3 %ifidn %3, pp mova m2, [pw_512] %else mova m2, [pw_2000] %endif mov r4d, %2 %ifidn %3, ps add r3, r3 cmp r5m, byte 0 je .loopH lea r6, [r1 + 2 * r1] sub r0, r6 add r4d, 7 %endif .loopH: xor r5, r5 %rep %1 / 8 %ifidn %3, pp FILTER_H8_W8 m0, m1, m4, m5, m3, m2, [r0 - 3 + r5], [r2 + r5] %else FILTER_H8_W8 m0, m1, m4, m5, m3, UNUSED, [r0 - 3 + r5] psubw m1, m2 movu [r2 + 2 * r5], m1 %endif add r5, 8 %endrep %rep (%1 % 8) / 4 FILTER_H8_W4 m0, m1 %ifidn %3, pp pmulhrsw m1, m2 packuswb m1, m1 movd [r2 + r5], m1 %else psubw m1, m2 movh [r2 + 2 * r5], m1 %endif %endrep add r0, r1 add r2, r3 dec r4d jnz .loopH RET %endmacro INIT_YMM avx2 cglobal interp_8tap_horiz_pp_4x4, 4,6,6 mov r4d, r4m %ifdef PIC lea r5, [tab_LumaCoeff] vpbroadcastq m0, [r5 + r4 * 8] %else vpbroadcastq m0, [tab_LumaCoeff + r4 * 8] %endif mova m1, [tab_Lm] vpbroadcastd m2, [pw_1] ; register map ; m0 - interpolate coeff ; m1 - shuffle order table ; m2 - constant word 1 sub r0, 3 ; Row 0-1 vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m3, m1 pmaddubsw m3, m0 pmaddwd m3, m2 vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m4, m1 pmaddubsw m4, m0 pmaddwd m4, m2 phaddd m3, m4 ; DWORD [R1D R1C R0D R0C R1B R1A R0B R0A] ; Row 2-3 lea r0, [r0 + r1 * 2] vbroadcasti128 m4, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m4, m1 pmaddubsw m4, m0 pmaddwd m4, m2 vbroadcasti128 m5, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m5, m1 pmaddubsw m5, m0 pmaddwd m5, m2 phaddd m4, m5 ; DWORD [R3D R3C R2D R2C R3B R3A R2B R2A] packssdw m3, m4 ; WORD [R3D R3C R2D R2C R1D R1C R0D R0C R3B R3A R2B R2A R1B R1A R0B R0A] pmulhrsw m3, [pw_512] vextracti128 xm4, m3, 1 packuswb xm3, xm4 ; BYTE [R3D R3C R2D R2C R1D R1C R0D R0C R3B R3A R2B R2A R1B R1A R0B R0A] pshufb xm3, [interp4_shuf] ; [row3 row1 row2 row0] lea r0, [r3 * 3] movd [r2], xm3 pextrd [r2+r3], xm3, 2 pextrd [r2+r3*2], xm3, 1 pextrd [r2+r0], xm3, 3 RET %macro FILTER_HORIZ_LUMA_AVX2_4xN 1 INIT_YMM avx2 %if ARCH_X86_64 == 1 cglobal interp_8tap_horiz_pp_4x%1, 4, 6, 9 mov r4d, r4m %ifdef PIC lea r5, [tab_LumaCoeff] vpbroadcastq m0, [r5 + r4 * 8] %else vpbroadcastq m0, [tab_LumaCoeff + r4 * 8] %endif mova m1, [tab_Lm] mova m2, [pw_1] mova m7, 
[interp8_hps_shuf] mova m8, [pw_512] ; register map ; m0 - interpolate coeff ; m1 - shuffle order table ; m2 - constant word 1 lea r4, [r1 * 3] lea r5, [r3 * 3] sub r0, 3 %rep %1 / 8 ; Row 0-1 vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m3, m1 pmaddubsw m3, m0 pmaddwd m3, m2 vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m4, m1 pmaddubsw m4, m0 pmaddwd m4, m2 phaddd m3, m4 ; DWORD [R1D R1C R0D R0C R1B R1A R0B R0A] ; Row 2-3 vbroadcasti128 m4, [r0 + r1 * 2] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m4, m1 pmaddubsw m4, m0 pmaddwd m4, m2 vbroadcasti128 m5, [r0 + r4] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m5, m1 pmaddubsw m5, m0 pmaddwd m5, m2 phaddd m4, m5 ; DWORD [R3D R3C R2D R2C R3B R3A R2B R2A] packssdw m3, m4 ; WORD [R3D R3C R2D R2C R1D R1C R0D R0C R3B R3A R2B R2A R1B R1A R0B R0A] lea r0, [r0 + r1 * 4] ; Row 4-5 vbroadcasti128 m5, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m5, m1 pmaddubsw m5, m0 pmaddwd m5, m2 vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m4, m1 pmaddubsw m4, m0 pmaddwd m4, m2 phaddd m5, m4 ; DWORD [R5D R5C R4D R4C R5B R5A R4B R4A] ; Row 6-7 vbroadcasti128 m4, [r0 + r1 * 2] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m4, m1 pmaddubsw m4, m0 pmaddwd m4, m2 vbroadcasti128 m6, [r0 + r4] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m6, m1 pmaddubsw m6, m0 pmaddwd m6, m2 phaddd m4, m6 ; DWORD [R7D R7C R6D R6C R7B R7A R6B R6A] packssdw m5, m4 ; WORD [R7D R7C R6D R6C R5D R5C R4D R4C R7B R7A R6B R6A R5B R5A R4B R4A] vpermd m3, m7, m3 vpermd m5, m7, m5 pmulhrsw m3, m8 pmulhrsw m5, m8 packuswb m3, m5 vextracti128 xm5, m3, 1 movd [r2], xm3 pextrd [r2 + r3], xm3, 1 movd [r2 + r3 * 2], xm5 pextrd [r2 + r5], xm5, 1 lea r2, [r2 + r3 * 4] pextrd [r2], xm3, 2 pextrd [r2 + r3], xm3, 3 pextrd [r2 + r3 * 2], xm5, 2 pextrd [r2 + r5], xm5, 3 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] %endrep RET %endif %endmacro FILTER_HORIZ_LUMA_AVX2_4xN 8 FILTER_HORIZ_LUMA_AVX2_4xN 16 INIT_YMM avx2 cglobal interp_8tap_horiz_pp_8x4, 4, 6, 7 mov r4d, r4m %ifdef PIC lea r5, [tab_LumaCoeff] vpbroadcastq m0, [r5 + r4 * 8] %else vpbroadcastq m0, [tab_LumaCoeff + r4 * 8] %endif mova m1, [tab_Lm] mova m2, [tab_Lm + 32] ; register map ; m0 - interpolate coeff ; m1, m2 - shuffle order table sub r0, 3 lea r5, [r1 * 3] lea r4, [r3 * 3] ; Row 0 vbroadcasti128 m3, [r0] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0] pshufb m4, m3, m2 pshufb m3, m1 pmaddubsw m3, m0 pmaddubsw m4, m0 phaddw m3, m4 ; Row 1 vbroadcasti128 m4, [r0 + r1] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0] pshufb m5, m4, m2 pshufb m4, m1 pmaddubsw m4, m0 pmaddubsw m5, m0 phaddw m4, m5 phaddw m3, m4 ; WORD [R1H R1G R1D R1C R0H R0G R0D R0C R1F R1E R1B R1A R0F R0E R0B R0A] pmulhrsw m3, [pw_512] ; Row 2 vbroadcasti128 m4, [r0 + r1 * 2] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0] pshufb m5, m4, m2 pshufb m4, m1 pmaddubsw m4, m0 pmaddubsw m5, m0 phaddw m4, m5 ; Row 3 vbroadcasti128 m5, [r0 + r5] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0] pshufb m6, m5, m2 pshufb m5, m1 pmaddubsw m5, m0 pmaddubsw m6, m0 phaddw m5, m6 phaddw m4, m5 ; WORD [R3H R3G R3D R3C R2H R2G R2D R2C R3F R3E R3B R3A R2F R2E R2B R2A] pmulhrsw m4, [pw_512] packuswb m3, m4 vextracti128 xm4, m3, 1 punpcklwd xm5, xm3, xm4 movq [r2], xm5 movhps [r2 + r3], xm5 punpckhwd xm5, xm3, xm4 movq [r2 + r3 * 2], xm5 movhps [r2 + r4], xm5 RET %macro IPFILTER_LUMA_AVX2_8xN 2 INIT_YMM avx2 cglobal interp_8tap_horiz_pp_%1x%2, 4, 7, 7 mov r4d, r4m %ifdef PIC lea r5, [tab_LumaCoeff] vpbroadcastq m0, [r5 + r4 * 8] %else vpbroadcastq m0, [tab_LumaCoeff + r4 * 8] 
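    ; (added note) r4 selects the fractional-sample position, so [tab_LumaCoeff + r4 * 8]
    ; fetches all eight packed int8 taps of that 8-tap luma filter; vpbroadcastq
    ; replicates them across the register for the pmaddubsw dot products below.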
%endif mova m1, [tab_Lm] mova m2, [tab_Lm + 32] ; register map ; m0 - interpolate coeff ; m1, m2 - shuffle order table sub r0, 3 lea r5, [r1 * 3] lea r6, [r3 * 3] mov r4d, %2 / 4 .loop: ; Row 0 vbroadcasti128 m3, [r0] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0] pshufb m4, m3, m2 pshufb m3, m1 pmaddubsw m3, m0 pmaddubsw m4, m0 phaddw m3, m4 ; Row 1 vbroadcasti128 m4, [r0 + r1] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0] pshufb m5, m4, m2 pshufb m4, m1 pmaddubsw m4, m0 pmaddubsw m5, m0 phaddw m4, m5 phaddw m3, m4 ; WORD [R1H R1G R1D R1C R0H R0G R0D R0C R1F R1E R1B R1A R0F R0E R0B R0A] pmulhrsw m3, [pw_512] ; Row 2 vbroadcasti128 m4, [r0 + r1 * 2] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0] pshufb m5, m4, m2 pshufb m4, m1 pmaddubsw m4, m0 pmaddubsw m5, m0 phaddw m4, m5 ; Row 3 vbroadcasti128 m5, [r0 + r5] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0] pshufb m6, m5, m2 pshufb m5, m1 pmaddubsw m5, m0 pmaddubsw m6, m0 phaddw m5, m6 phaddw m4, m5 ; WORD [R3H R3G R3D R3C R2H R2G R2D R2C R3F R3E R3B R3A R2F R2E R2B R2A] pmulhrsw m4, [pw_512] packuswb m3, m4 vextracti128 xm4, m3, 1 punpcklwd xm5, xm3, xm4 movq [r2], xm5 movhps [r2 + r3], xm5 punpckhwd xm5, xm3, xm4 movq [r2 + r3 * 2], xm5 movhps [r2 + r6], xm5 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] dec r4d jnz .loop RET %endmacro IPFILTER_LUMA_AVX2_8xN 8, 8 IPFILTER_LUMA_AVX2_8xN 8, 16 IPFILTER_LUMA_AVX2_8xN 8, 32 %macro IPFILTER_LUMA_AVX2 2 INIT_YMM avx2 cglobal interp_8tap_horiz_pp_%1x%2, 4,6,8 sub r0, 3 mov r4d, r4m %ifdef PIC lea r5, [tab_LumaCoeff] vpbroadcastd m0, [r5 + r4 * 8] vpbroadcastd m1, [r5 + r4 * 8 + 4] %else vpbroadcastd m0, [tab_LumaCoeff + r4 * 8] vpbroadcastd m1, [tab_LumaCoeff + r4 * 8 + 4] %endif movu m3, [tab_Tm + 16] vpbroadcastd m7, [pw_1] ; register map ; m0 , m1 interpolate coeff ; m2 , m2 shuffle order table ; m7 - pw_1 mov r4d, %2/2 .loop: ; Row 0 vbroadcasti128 m4, [r0] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0] pshufb m5, m4, m3 pshufb m4, [tab_Tm] pmaddubsw m4, m0 pmaddubsw m5, m1 paddw m4, m5 pmaddwd m4, m7 vbroadcasti128 m5, [r0 + 8] ; second 8 elements in Row0 pshufb m6, m5, m3 pshufb m5, [tab_Tm] pmaddubsw m5, m0 pmaddubsw m6, m1 paddw m5, m6 pmaddwd m5, m7 packssdw m4, m5 ; [17 16 15 14 07 06 05 04 13 12 11 10 03 02 01 00] pmulhrsw m4, [pw_512] vbroadcasti128 m2, [r0 + r1] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0] pshufb m5, m2, m3 pshufb m2, [tab_Tm] pmaddubsw m2, m0 pmaddubsw m5, m1 paddw m2, m5 pmaddwd m2, m7 vbroadcasti128 m5, [r0 + r1 + 8] ; second 8 elements in Row0 pshufb m6, m5, m3 pshufb m5, [tab_Tm] pmaddubsw m5, m0 pmaddubsw m6, m1 paddw m5, m6 pmaddwd m5, m7 packssdw m2, m5 ; [17 16 15 14 07 06 05 04 13 12 11 10 03 02 01 00] pmulhrsw m2, [pw_512] packuswb m4, m2 vpermq m4, m4, 11011000b vextracti128 xm5, m4, 1 pshufd xm4, xm4, 11011000b pshufd xm5, xm5, 11011000b movu [r2], xm4 movu [r2+r3], xm5 lea r0, [r0 + r1 * 2] lea r2, [r2 + r3 * 2] dec r4d jnz .loop RET %endmacro %macro IPFILTER_LUMA_32x_avx2 2 INIT_YMM avx2 cglobal interp_8tap_horiz_pp_%1x%2, 4,6,8 sub r0, 3 mov r4d, r4m %ifdef PIC lea r5, [tab_LumaCoeff] vpbroadcastd m0, [r5 + r4 * 8] vpbroadcastd m1, [r5 + r4 * 8 + 4] %else vpbroadcastd m0, [tab_LumaCoeff + r4 * 8] vpbroadcastd m1, [tab_LumaCoeff + r4 * 8 + 4] %endif movu m3, [tab_Tm + 16] vpbroadcastd m7, [pw_1] ; register map ; m0 , m1 interpolate coeff ; m2 , m2 shuffle order table ; m7 - pw_1 mov r4d, %2 .loop: ; Row 0 vbroadcasti128 m4, [r0] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0] pshufb m5, m4, m3 pshufb m4, [tab_Tm] pmaddubsw m4, m0 pmaddubsw m5, m1 paddw m4, m5 pmaddwd m4, m7 vbroadcasti128 m5, [r0 + 8] pshufb m6, m5, m3 pshufb 
m5, [tab_Tm] pmaddubsw m5, m0 pmaddubsw m6, m1 paddw m5, m6 pmaddwd m5, m7 packssdw m4, m5 ; [17 16 15 14 07 06 05 04 13 12 11 10 03 02 01 00] pmulhrsw m4, [pw_512] vbroadcasti128 m2, [r0 + 16] pshufb m5, m2, m3 pshufb m2, [tab_Tm] pmaddubsw m2, m0 pmaddubsw m5, m1 paddw m2, m5 pmaddwd m2, m7 vbroadcasti128 m5, [r0 + 24] pshufb m6, m5, m3 pshufb m5, [tab_Tm] pmaddubsw m5, m0 pmaddubsw m6, m1 paddw m5, m6 pmaddwd m5, m7 packssdw m2, m5 pmulhrsw m2, [pw_512] packuswb m4, m2 vpermq m4, m4, 11011000b vextracti128 xm5, m4, 1 pshufd xm4, xm4, 11011000b pshufd xm5, xm5, 11011000b movu [r2], xm4 movu [r2 + 16], xm5 lea r0, [r0 + r1] lea r2, [r2 + r3] dec r4d jnz .loop RET %endmacro %macro IPFILTER_LUMA_64x_avx2 2 INIT_YMM avx2 cglobal interp_8tap_horiz_pp_%1x%2, 4,6,8 sub r0, 3 mov r4d, r4m %ifdef PIC lea r5, [tab_LumaCoeff] vpbroadcastd m0, [r5 + r4 * 8] vpbroadcastd m1, [r5 + r4 * 8 + 4] %else vpbroadcastd m0, [tab_LumaCoeff + r4 * 8] vpbroadcastd m1, [tab_LumaCoeff + r4 * 8 + 4] %endif movu m3, [tab_Tm + 16] vpbroadcastd m7, [pw_1] ; register map ; m0 , m1 interpolate coeff ; m2 , m2 shuffle order table ; m7 - pw_1 mov r4d, %2 .loop: ; Row 0 vbroadcasti128 m4, [r0] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0] pshufb m5, m4, m3 pshufb m4, [tab_Tm] pmaddubsw m4, m0 pmaddubsw m5, m1 paddw m4, m5 pmaddwd m4, m7 vbroadcasti128 m5, [r0 + 8] pshufb m6, m5, m3 pshufb m5, [tab_Tm] pmaddubsw m5, m0 pmaddubsw m6, m1 paddw m5, m6 pmaddwd m5, m7 packssdw m4, m5 ; [17 16 15 14 07 06 05 04 13 12 11 10 03 02 01 00] pmulhrsw m4, [pw_512] vbroadcasti128 m2, [r0 + 16] pshufb m5, m2, m3 pshufb m2, [tab_Tm] pmaddubsw m2, m0 pmaddubsw m5, m1 paddw m2, m5 pmaddwd m2, m7 vbroadcasti128 m5, [r0 + 24] pshufb m6, m5, m3 pshufb m5, [tab_Tm] pmaddubsw m5, m0 pmaddubsw m6, m1 paddw m5, m6 pmaddwd m5, m7 packssdw m2, m5 pmulhrsw m2, [pw_512] packuswb m4, m2 vpermq m4, m4, 11011000b vextracti128 xm5, m4, 1 pshufd xm4, xm4, 11011000b pshufd xm5, xm5, 11011000b movu [r2], xm4 movu [r2 + 16], xm5 vbroadcasti128 m4, [r0 + 32] pshufb m5, m4, m3 pshufb m4, [tab_Tm] pmaddubsw m4, m0 pmaddubsw m5, m1 paddw m4, m5 pmaddwd m4, m7 vbroadcasti128 m5, [r0 + 40] pshufb m6, m5, m3 pshufb m5, [tab_Tm] pmaddubsw m5, m0 pmaddubsw m6, m1 paddw m5, m6 pmaddwd m5, m7 packssdw m4, m5 pmulhrsw m4, [pw_512] vbroadcasti128 m2, [r0 + 48] pshufb m5, m2, m3 pshufb m2, [tab_Tm] pmaddubsw m2, m0 pmaddubsw m5, m1 paddw m2, m5 pmaddwd m2, m7 vbroadcasti128 m5, [r0 + 56] pshufb m6, m5, m3 pshufb m5, [tab_Tm] pmaddubsw m5, m0 pmaddubsw m6, m1 paddw m5, m6 pmaddwd m5, m7 packssdw m2, m5 pmulhrsw m2, [pw_512] packuswb m4, m2 vpermq m4, m4, 11011000b vextracti128 xm5, m4, 1 pshufd xm4, xm4, 11011000b pshufd xm5, xm5, 11011000b movu [r2 +32], xm4 movu [r2 + 48], xm5 lea r0, [r0 + r1] lea r2, [r2 + r3] dec r4d jnz .loop RET %endmacro INIT_YMM avx2 cglobal interp_8tap_horiz_pp_48x64, 4,6,8 sub r0, 3 mov r4d, r4m %ifdef PIC lea r5, [tab_LumaCoeff] vpbroadcastd m0, [r5 + r4 * 8] vpbroadcastd m1, [r5 + r4 * 8 + 4] %else vpbroadcastd m0, [tab_LumaCoeff + r4 * 8] vpbroadcastd m1, [tab_LumaCoeff + r4 * 8 + 4] %endif movu m3, [tab_Tm + 16] vpbroadcastd m7, [pw_1] ; register map ; m0 , m1 interpolate coeff ; m2 , m2 shuffle order table ; m7 - pw_1 mov r4d, 64 .loop: ; Row 0 vbroadcasti128 m4, [r0] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0] pshufb m5, m4, m3 pshufb m4, [tab_Tm] pmaddubsw m4, m0 pmaddubsw m5, m1 paddw m4, m5 pmaddwd m4, m7 vbroadcasti128 m5, [r0 + 8] pshufb m6, m5, m3 pshufb m5, [tab_Tm] pmaddubsw m5, m0 pmaddubsw m6, m1 paddw m5, m6 pmaddwd m5, m7 packssdw m4, m5 ; [17 16 
15 14 07 06 05 04 13 12 11 10 03 02 01 00] pmulhrsw m4, [pw_512] vbroadcasti128 m2, [r0 + 16] pshufb m5, m2, m3 pshufb m2, [tab_Tm] pmaddubsw m2, m0 pmaddubsw m5, m1 paddw m2, m5 pmaddwd m2, m7 vbroadcasti128 m5, [r0 + 24] pshufb m6, m5, m3 pshufb m5, [tab_Tm] pmaddubsw m5, m0 pmaddubsw m6, m1 paddw m5, m6 pmaddwd m5, m7 packssdw m2, m5 pmulhrsw m2, [pw_512] packuswb m4, m2 vpermq m4, m4, 11011000b vextracti128 xm5, m4, 1 pshufd xm4, xm4, 11011000b pshufd xm5, xm5, 11011000b movu [r2], xm4 movu [r2 + 16], xm5 vbroadcasti128 m4, [r0 + 32] pshufb m5, m4, m3 pshufb m4, [tab_Tm] pmaddubsw m4, m0 pmaddubsw m5, m1 paddw m4, m5 pmaddwd m4, m7 vbroadcasti128 m5, [r0 + 40] pshufb m6, m5, m3 pshufb m5, [tab_Tm] pmaddubsw m5, m0 pmaddubsw m6, m1 paddw m5, m6 pmaddwd m5, m7 packssdw m4, m5 pmulhrsw m4, [pw_512] packuswb m4, m4 vpermq m4, m4, 11011000b pshufd xm4, xm4, 11011000b movu [r2 + 32], xm4 lea r0, [r0 + r1] lea r2, [r2 + r3] dec r4d jnz .loop RET INIT_YMM avx2 cglobal interp_4tap_horiz_pp_4x4, 4,6,6 mov r4d, r4m %ifdef PIC lea r5, [tab_ChromaCoeff] vpbroadcastd m0, [r5 + r4 * 4] %else vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] %endif vpbroadcastd m2, [pw_1] vbroadcasti128 m1, [tab_Tm] ; register map ; m0 - interpolate coeff ; m1 - shuffle order table ; m2 - constant word 1 dec r0 ; Row 0-1 vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] vinserti128 m3, m3, [r0 + r1], 1 pshufb m3, m1 pmaddubsw m3, m0 pmaddwd m3, m2 ; Row 2-3 lea r0, [r0 + r1 * 2] vbroadcasti128 m4, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] vinserti128 m4, m4, [r0 + r1], 1 pshufb m4, m1 pmaddubsw m4, m0 pmaddwd m4, m2 packssdw m3, m4 pmulhrsw m3, [pw_512] vextracti128 xm4, m3, 1 packuswb xm3, xm4 lea r0, [r3 * 3] movd [r2], xm3 pextrd [r2+r3], xm3, 2 pextrd [r2+r3*2], xm3, 1 pextrd [r2+r0], xm3, 3 RET INIT_YMM avx2 cglobal interp_4tap_horiz_pp_2x4, 4, 6, 3 mov r4d, r4m %ifdef PIC lea r5, [tab_ChromaCoeff] vpbroadcastd m0, [r5 + r4 * 4] %else vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] %endif dec r0 lea r4, [r1 * 3] movq xm1, [r0] movhps xm1, [r0 + r1] movq xm2, [r0 + r1 * 2] movhps xm2, [r0 + r4] vinserti128 m1, m1, xm2, 1 pshufb m1, [interp4_hpp_shuf] pmaddubsw m1, m0 pmaddwd m1, [pw_1] vextracti128 xm2, m1, 1 packssdw xm1, xm2 pmulhrsw xm1, [pw_512] packuswb xm1, xm1 lea r4, [r3 * 3] pextrw [r2], xm1, 0 pextrw [r2 + r3], xm1, 1 pextrw [r2 + r3 * 2], xm1, 2 pextrw [r2 + r4], xm1, 3 RET INIT_YMM avx2 cglobal interp_4tap_horiz_pp_2x8, 4, 6, 6 mov r4d, r4m %ifdef PIC lea r5, [tab_ChromaCoeff] vpbroadcastd m0, [r5 + r4 * 4] %else vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] %endif mova m4, [interp4_hpp_shuf] mova m5, [pw_1] dec r0 lea r4, [r1 * 3] movq xm1, [r0] movhps xm1, [r0 + r1] movq xm2, [r0 + r1 * 2] movhps xm2, [r0 + r4] vinserti128 m1, m1, xm2, 1 lea r0, [r0 + r1 * 4] movq xm3, [r0] movhps xm3, [r0 + r1] movq xm2, [r0 + r1 * 2] movhps xm2, [r0 + r4] vinserti128 m3, m3, xm2, 1 pshufb m1, m4 pshufb m3, m4 pmaddubsw m1, m0 pmaddubsw m3, m0 pmaddwd m1, m5 pmaddwd m3, m5 packssdw m1, m3 pmulhrsw m1, [pw_512] vextracti128 xm2, m1, 1 packuswb xm1, xm2 lea r4, [r3 * 3] pextrw [r2], xm1, 0 pextrw [r2 + r3], xm1, 1 pextrw [r2 + r3 * 2], xm1, 4 pextrw [r2 + r4], xm1, 5 lea r2, [r2 + r3 * 4] pextrw [r2], xm1, 2 pextrw [r2 + r3], xm1, 3 pextrw [r2 + r3 * 2], xm1, 6 pextrw [r2 + r4], xm1, 7 RET INIT_YMM avx2 cglobal interp_4tap_horiz_pp_32x32, 4,6,7 mov r4d, r4m %ifdef PIC lea r5, [tab_ChromaCoeff] vpbroadcastd m0, [r5 + r4 * 4] %else vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] %endif mova m1, [interp4_horiz_shuf1] 
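    ; (added note) 4-tap horizontal scheme used below: pshufb gathers each output
    ; pixel's four source neighbours, pmaddubsw forms pairwise 8-bit products, and
    ; pmaddwd against the pw_1 constant (loaded just below into m2) adds the
    ; remaining word pairs, completing the dot product. pmulhrsw against
    ; m6 = pw_512 then rounds, since pmulhrsw(x, 512) = (x*512 + 0x4000) >> 15
    ; = (x + 32) >> 6, i.e. the usual 6-bit filter normalisation.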
vpbroadcastd m2, [pw_1] mova m6, [pw_512] ; register map ; m0 - interpolate coeff ; m1 - shuffle order table ; m2 - constant word 1 dec r0 mov r4d, 32 .loop: ; Row 0 vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m3, m1 pmaddubsw m3, m0 pmaddwd m3, m2 vbroadcasti128 m4, [r0 + 4] pshufb m4, m1 pmaddubsw m4, m0 pmaddwd m4, m2 packssdw m3, m4 pmulhrsw m3, m6 vbroadcasti128 m4, [r0 + 16] pshufb m4, m1 pmaddubsw m4, m0 pmaddwd m4, m2 vbroadcasti128 m5, [r0 + 20] pshufb m5, m1 pmaddubsw m5, m0 pmaddwd m5, m2 packssdw m4, m5 pmulhrsw m4, m6 packuswb m3, m4 vpermq m3, m3, 11011000b movu [r2], m3 lea r2, [r2 + r3] lea r0, [r0 + r1] dec r4d jnz .loop RET INIT_YMM avx2 cglobal interp_4tap_horiz_pp_16x16, 4, 6, 7 mov r4d, r4m %ifdef PIC lea r5, [tab_ChromaCoeff] vpbroadcastd m0, [r5 + r4 * 4] %else vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] %endif mova m6, [pw_512] mova m1, [interp4_horiz_shuf1] vpbroadcastd m2, [pw_1] ; register map ; m0 - interpolate coeff ; m1 - shuffle order table ; m2 - constant word 1 dec r0 mov r4d, 8 .loop: ; Row 0 vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m3, m1 pmaddubsw m3, m0 pmaddwd m3, m2 vbroadcasti128 m4, [r0 + 4] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m4, m1 pmaddubsw m4, m0 pmaddwd m4, m2 packssdw m3, m4 pmulhrsw m3, m6 ; Row 1 vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m4, m1 pmaddubsw m4, m0 pmaddwd m4, m2 vbroadcasti128 m5, [r0 + r1 + 4] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m5, m1 pmaddubsw m5, m0 pmaddwd m5, m2 packssdw m4, m5 pmulhrsw m4, m6 packuswb m3, m4 vpermq m3, m3, 11011000b vextracti128 xm4, m3, 1 movu [r2], xm3 movu [r2 + r3], xm4 lea r2, [r2 + r3 * 2] lea r0, [r0 + r1 * 2] dec r4d jnz .loop RET ;-------------------------------------------------------------------------------------------------------------- ; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;-------------------------------------------------------------------------------------------------------------- IPFILTER_LUMA 4, 4, pp IPFILTER_LUMA 4, 8, pp IPFILTER_LUMA 12, 16, pp IPFILTER_LUMA 4, 16, pp INIT_YMM avx2 cglobal interp_4tap_horiz_pp_8x8, 4,6,6 mov r4d, r4m %ifdef PIC lea r5, [tab_ChromaCoeff] vpbroadcastd m0, [r5 + r4 * 4] %else vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] %endif movu m1, [tab_Tm] vpbroadcastd m2, [pw_1] ; register map ; m0 - interpolate coeff ; m1 - shuffle order table ; m2 - constant word 1 sub r0, 1 mov r4d, 2 .loop: ; Row 0 vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m3, m1 pmaddubsw m3, m0 pmaddwd m3, m2 ; Row 1 vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m4, m1 pmaddubsw m4, m0 pmaddwd m4, m2 packssdw m3, m4 pmulhrsw m3, [pw_512] lea r0, [r0 + r1 * 2] ; Row 2 vbroadcasti128 m4, [r0 ] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m4, m1 pmaddubsw m4, m0 pmaddwd m4, m2 ; Row 3 vbroadcasti128 m5, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m5, m1 pmaddubsw m5, m0 pmaddwd m5, m2 packssdw m4, m5 pmulhrsw m4, [pw_512] packuswb m3, m4 mova m5, [interp_4tap_8x8_horiz_shuf] vpermd m3, m5, m3 vextracti128 xm4, m3, 1 movq [r2], xm3 movhps [r2 + r3], xm3 lea r2, [r2 + r3 * 2] movq [r2], xm4 movhps [r2 + r3], xm4 lea r2, [r2 + r3 * 2] lea r0, [r0 + r1*2] dec r4d jnz .loop RET IPFILTER_LUMA_AVX2 16, 4 IPFILTER_LUMA_AVX2 16, 8 IPFILTER_LUMA_AVX2 16, 12 IPFILTER_LUMA_AVX2 16, 16 IPFILTER_LUMA_AVX2 16, 32 IPFILTER_LUMA_AVX2 16, 64 IPFILTER_LUMA_32x_avx2 32 , 8 
IPFILTER_LUMA_32x_avx2 32 , 16 IPFILTER_LUMA_32x_avx2 32 , 24 IPFILTER_LUMA_32x_avx2 32 , 32 IPFILTER_LUMA_32x_avx2 32 , 64 IPFILTER_LUMA_64x_avx2 64 , 64 IPFILTER_LUMA_64x_avx2 64 , 48 IPFILTER_LUMA_64x_avx2 64 , 32 IPFILTER_LUMA_64x_avx2 64 , 16 INIT_YMM avx2 cglobal interp_4tap_horiz_pp_8x2, 4, 6, 5 mov r4d, r4m %ifdef PIC lea r5, [tab_ChromaCoeff] vpbroadcastd m0, [r5 + r4 * 4] %else vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] %endif mova m1, [tab_Tm] mova m2, [pw_1] ; register map ; m0 - interpolate coeff ; m1 - shuffle order table ; m2 - constant word 1 dec r0 ; Row 0 vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m3, m1 pmaddubsw m3, m0 pmaddwd m3, m2 ; Row 1 vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m4, m1 pmaddubsw m4, m0 pmaddwd m4, m2 packssdw m3, m4 pmulhrsw m3, [pw_512] vextracti128 xm4, m3, 1 packuswb xm3, xm4 pshufd xm3, xm3, 11011000b movq [r2], xm3 movhps [r2 + r3], xm3 RET INIT_YMM avx2 cglobal interp_4tap_horiz_pp_8x6, 4, 6, 7 mov r4d, r4m %ifdef PIC lea r5, [tab_ChromaCoeff] vpbroadcastd m0, [r5 + r4 * 4] %else vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] %endif mova m1, [tab_Tm] mova m2, [pw_1] mova m6, [pw_512] lea r4, [r1 * 3] lea r5, [r3 * 3] ; register map ; m0 - interpolate coeff ; m1 - shuffle order table ; m2 - constant word 1 dec r0 ; Row 0 vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m3, m1 pmaddubsw m3, m0 pmaddwd m3, m2 ; Row 1 vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m4, m1 pmaddubsw m4, m0 pmaddwd m4, m2 packssdw m3, m4 pmulhrsw m3, m6 ; Row 2 vbroadcasti128 m4, [r0 + r1 * 2] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m4, m1 pmaddubsw m4, m0 pmaddwd m4, m2 ; Row 3 vbroadcasti128 m5, [r0 + r4] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m5, m1 pmaddubsw m5, m0 pmaddwd m5, m2 packssdw m4, m5 pmulhrsw m4, m6 packuswb m3, m4 mova m5, [interp8_hps_shuf] vpermd m3, m5, m3 vextracti128 xm4, m3, 1 movq [r2], xm3 movhps [r2 + r3], xm3 movq [r2 + r3 * 2], xm4 movhps [r2 + r5], xm4 lea r2, [r2 + r3 * 4] lea r0, [r0 + r1 * 4] ; Row 4 vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m3, m1 pmaddubsw m3, m0 pmaddwd m3, m2 ; Row 5 vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m4, m1 pmaddubsw m4, m0 pmaddwd m4, m2 packssdw m3, m4 pmulhrsw m3, m6 vextracti128 xm4, m3, 1 packuswb xm3, xm4 pshufd xm3, xm3, 11011000b movq [r2], xm3 movhps [r2 + r3], xm3 RET INIT_YMM avx2 cglobal interp_4tap_horiz_pp_6x8, 4, 6, 7 mov r4d, r4m %ifdef PIC lea r5, [tab_ChromaCoeff] vpbroadcastd m0, [r5 + r4 * 4] %else vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] %endif mova m1, [tab_Tm] mova m2, [pw_1] mova m6, [pw_512] lea r4, [r1 * 3] lea r5, [r3 * 3] ; register map ; m0 - interpolate coeff ; m1 - shuffle order table ; m2 - constant word 1 dec r0 %rep 2 ; Row 0 vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m3, m1 pmaddubsw m3, m0 pmaddwd m3, m2 ; Row 1 vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m4, m1 pmaddubsw m4, m0 pmaddwd m4, m2 packssdw m3, m4 pmulhrsw m3, m6 ; Row 2 vbroadcasti128 m4, [r0 + r1 * 2] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m4, m1 pmaddubsw m4, m0 pmaddwd m4, m2 ; Row 3 vbroadcasti128 m5, [r0 + r4] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m5, m1 pmaddubsw m5, m0 pmaddwd m5, m2 packssdw m4, m5 pmulhrsw m4, m6 packuswb m3, m4 vextracti128 xm4, m3, 1 movd [r2], xm3 pextrw [r2 + 4], xm4, 0 pextrd [r2 + r3], xm3, 1 pextrw [r2 + r3 + 4], xm4, 2 pextrd [r2 + r3 * 2], xm3, 2 pextrw [r2 
+ r3 * 2 + 4], xm4, 4
    pextrd [r2 + r5], xm3, 3
    pextrw [r2 + r5 + 4], xm4, 6
    lea r2, [r2 + r3 * 4]
    lea r0, [r0 + r1 * 4]
%endrep
    RET

;-----------------------------------------------------------------------------------------------------------------------------
; void interp_4tap_horiz_ps_64xN(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
;-----------------------------------------------------------------------------------------------------------------------------
%macro IPFILTER_CHROMA_HPS_64xN 1
INIT_YMM avx2
cglobal interp_4tap_horiz_ps_64x%1, 4,7,6
    mov r4d, r4m
    mov r5d, r5m
    add r3d, r3d

%ifdef PIC
    lea r6, [tab_ChromaCoeff]
    vpbroadcastd m0, [r6 + r4 * 4]
%else
    vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4]
%endif

    vbroadcasti128 m2, [pw_1]
    vbroadcasti128 m5, [pw_2000]
    mova m1, [tab_Tm]

    ; register map
    ; m0 - interpolate coeff
    ; m1 - shuffle order table
    ; m2 - constant word 1
    mov r6d, %1
    dec r0
    test r5d, r5d
    je .loop
    sub r0, r1
    add r6d, 3

.loop:
    ; Row 0
    vbroadcasti128 m3, [r0]      ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
    pshufb m3, m1
    pmaddubsw m3, m0
    pmaddwd m3, m2
    vbroadcasti128 m4, [r0 + 8]  ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
    pshufb m4, m1
    pmaddubsw m4, m0
    pmaddwd m4, m2

    packssdw m3, m4
    psubw m3, m5
    vpermq m3, m3, 11011000b
    movu [r2], m3

    vbroadcasti128 m3, [r0 + 16] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
    pshufb m3, m1
    pmaddubsw m3, m0
    pmaddwd m3, m2
    vbroadcasti128 m4, [r0 + 24] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
    pshufb m4, m1
    pmaddubsw m4, m0
    pmaddwd m4, m2

    packssdw m3, m4
    psubw m3, m5
    vpermq m3, m3, 11011000b
    movu [r2 + 32], m3

    vbroadcasti128 m3, [r0 + 32] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
    pshufb m3, m1
    pmaddubsw m3, m0
    pmaddwd m3, m2
    vbroadcasti128 m4, [r0 + 40] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
    pshufb m4, m1
    pmaddubsw m4, m0
    pmaddwd m4, m2

    packssdw m3, m4
    psubw m3, m5
    vpermq m3, m3, 11011000b
    movu [r2 + 64], m3

    vbroadcasti128 m3, [r0 + 48] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
    pshufb m3, m1
    pmaddubsw m3, m0
    pmaddwd m3, m2
    vbroadcasti128 m4, [r0 + 56] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
    pshufb m4, m1
    pmaddubsw m4, m0
    pmaddwd m4, m2

    packssdw m3, m4
    psubw m3, m5
    vpermq m3, m3, 11011000b
    movu [r2 + 96], m3

    add r2, r3
    add r0, r1
    dec r6d
    jnz .loop
    RET
%endmacro

IPFILTER_CHROMA_HPS_64xN 64
IPFILTER_CHROMA_HPS_64xN 32
IPFILTER_CHROMA_HPS_64xN 48
IPFILTER_CHROMA_HPS_64xN 16

;-----------------------------------------------------------------------------------------------------------------------------
; void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
;-----------------------------------------------------------------------------------------------------------------------------
%macro IPFILTER_LUMA_PS_4xN_AVX2 1
INIT_YMM avx2
%if ARCH_X86_64 == 1
cglobal interp_8tap_horiz_ps_4x%1, 6,7,6
    mov r5d, r5m
    mov r4d, r4m

%ifdef PIC
    lea r6, [tab_LumaCoeff]
    vpbroadcastq m0, [r6 + r4 * 8]
%else
    vpbroadcastq m0, [tab_LumaCoeff + r4 * 8]
%endif

    mova m1, [tab_Lm]
    add r3d, r3d
    vbroadcasti128 m2, [pw_2000]

    ; register map
    ; m0 - interpolate coeff
    ; m1 - shuffle order table
    ; m2 - pw_2000
    sub r0, 3
    test r5d, r5d
    mov r5d, %1      ; loop count variable - height
    jz .preloop
    lea r6, [r1 * 3] ; r6 = (N / 2 - 1) * srcStride
    sub r0, r6       ; r0(src) - 3 * srcStride
    add r5d, 7       ; isRowExt: blkheight += N - 1 = 7; the last three rows are handled by the tail code after the loop

.preloop:
    lea r6, [r3 * 3]
.loop:
    ; Row 0-1
    vbroadcasti128 m3, [r0]      ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
    pshufb m3, m1                ; shuffled based on the col order tab_Lm
    pmaddubsw m3, m0
    vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
    pshufb m4, m1
    pmaddubsw m4, m0
    phaddw m3, m4                ; DWORD [R1D R1C R0D R0C R1B R1A R0B R0A]

    ; Row 2-3
    lea r0, [r0 + r1 * 2]        ; advance src to row 2
    vbroadcasti128 m4, [r0]      ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
    pshufb m4, m1
    pmaddubsw m4, m0
    vbroadcasti128 m5, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
    pshufb m5, m1
    pmaddubsw m5, m0
    phaddw m4, m5                ; DWORD [R3D R3C R2D R2C R3B R3A R2B R2A]

    phaddw m3, m4                ; all rows and columns accumulated
    mova m5, [interp8_hps_shuf]
    vpermd m3, m5, m3
    psubw m3, m2

    vextracti128 xm4, m3, 1
    movq [r2], xm3               ; row 0
    movhps [r2 + r3], xm3        ; row 1
    movq [r2 + r3 * 2], xm4      ; row 2
    movhps [r2 + r6], xm4        ; row 3

    lea r0, [r0 + r1 * 2]        ; src -> next 4-row group
    lea r2, [r2 + r3 * 4]        ; dst -> next 4-row group
    sub r5d, 4
    jz .end
    cmp r5d, 4
    jge .loop

    ; tail: the last three rows of the row extension
    ; Row 8-9
    vbroadcasti128 m3, [r0]      ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
    pshufb m3, m1
    pmaddubsw m3, m0
    vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
    pshufb m4, m1
    pmaddubsw m4, m0
    phaddw m3, m4                ; DWORD [R1D R1C R0D R0C R1B R1A R0B R0A]

    ; Row 10
    vbroadcasti128 m4, [r0 + r1 * 2] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
    pshufb m4, m1
    pmaddubsw m4, m0
    phaddw m4, m4                ; DWORD [R3D R3C R2D R2C R3B R3A R2B R2A]
    phaddw m3, m4

    vpermd m3, m5, m3            ; m5 still holds interp8_hps_shuf (not clobbered above)
    psubw m3, m2
    vextracti128 xm4, m3, 1
    movq [r2], xm3
    movhps [r2 + r3], xm3
    movq [r2 + r3 * 2], xm4
.end:
    RET
%endif
%endmacro

IPFILTER_LUMA_PS_4xN_AVX2 4
IPFILTER_LUMA_PS_4xN_AVX2 8
IPFILTER_LUMA_PS_4xN_AVX2 16

%macro IPFILTER_LUMA_PS_8xN_AVX2 1
; TODO: verify and enable for 32-bit x86 builds
%if ARCH_X86_64 == 1
; void filter_hps(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
INIT_YMM avx2
cglobal interp_8tap_horiz_ps_8x%1, 4,7,6
    mov r5d, r5m
    mov r4d, r4m
    shl r4d, 7

%ifdef PIC
    lea r6, [pb_LumaCoeffVer]
    add r6, r4
%else
    lea r6, [pb_LumaCoeffVer + r4]
%endif

    add r3d, r3d
    vpbroadcastd m0, [pw_2000]
    sub r0, 3
    lea r4, [pb_8tap_hps_0]
    vbroadcasti128 m5, [r4 + 0 * mmsize]

    ; check the row-count extension for interpolateHV
    test r5d, r5d
    mov r5d, %1
    jz .enter_loop
    lea r4, [r1 * 3] ; r4 = (N / 2 - 1) * srcStride
    sub r0, r4       ; src -= 3 * srcStride
    add r5d, 8-1-2   ; row extension: blkheight + 7 rows in total (the 2-row loop plus the single tail row below)

.enter_loop:
    lea r4, [pb_8tap_hps_0]

    ; ***** register map *****
    ; m0 - pw_2000
    ; r4 - base pointer of shuffle order table
    ; r5 - loop count
    ; r6 - points to LumaCoeff
.loop:
    ; Row 0-1
    movu xm1, [r0]
    movu xm2, [r0 + r1]
    vinserti128 m1, m1, xm2, 1
    pshufb m2, m1, m5                ; [0 1 1 2 2 3 3 4 ...]
    pshufb m3, m1, [r4 + 1 * mmsize] ; [2 3 3 4 4 5 5 6 ...]
    pshufb m4, m1, [r4 + 2 * mmsize] ; [4 5 5 6 6 7 7 8 ...]
    pshufb m1, m1, [r4 + 3 * mmsize] ; [6 7 7 8 8 9 9 A ...]
    pmaddubsw m2, [r6 + 0 * mmsize]
    pmaddubsw m3, [r6 + 1 * mmsize]
    pmaddubsw m4, [r6 + 2 * mmsize]
    pmaddubsw m1, [r6 + 3 * mmsize]
    paddw m2, m3
    paddw m1, m4
    paddw m1, m2
    psubw m1, m0

    vextracti128 xm2, m1, 1
    movu [r2], xm1      ; row 0
    movu [r2 + r3], xm2 ; row 1

    lea r0, [r0 + r1 * 2] ; src -> next 2 rows
    lea r2, [r2 + r3 * 2] ; dst -> next 2 rows
    sub r5d, 2
    jg .loop
    jz .end

    ; last row
    movu xm1, [r0]
    pshufb xm2, xm1, xm5               ; [0 1 1 2 2 3 3 4 ...]
    pshufb xm3, xm1, [r4 + 1 * mmsize] ; [2 3 3 4 4 5 5 6 ...]
    pshufb xm4, xm1, [r4 + 2 * mmsize] ; [4 5 5 6 6 7 7 8 ...]
    pshufb xm1, xm1, [r4 + 3 * mmsize] ; [6 7 7 8 8 9 9 A ...]
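    ; (added note) same staggered scheme as the main loop above: the four
    ; pb_8tap_hps_0 shuffles pair taps (0,1) (2,3) (4,5) (6,7), the four
    ; pmaddubsw/paddw steps below accumulate the full 8-tap sum, and psubw
    ; against xm0 applies the -8192 (pw_2000) bias of the 16-bit "ps" output.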
pmaddubsw xm2, [r6 + 0 * mmsize] pmaddubsw xm3, [r6 + 1 * mmsize] pmaddubsw xm4, [r6 + 2 * mmsize] pmaddubsw xm1, [r6 + 3 * mmsize] paddw xm2, xm3 paddw xm1, xm4 paddw xm1, xm2 psubw xm1, xm0 movu [r2], xm1 ;row 0 .end RET %endif %endmacro ; IPFILTER_LUMA_PS_8xN_AVX2 IPFILTER_LUMA_PS_8xN_AVX2 4 IPFILTER_LUMA_PS_8xN_AVX2 8 IPFILTER_LUMA_PS_8xN_AVX2 16 IPFILTER_LUMA_PS_8xN_AVX2 32 %macro IPFILTER_LUMA_PS_16x_AVX2 2 INIT_YMM avx2 %if ARCH_X86_64 == 1 cglobal interp_8tap_horiz_ps_%1x%2, 6, 10, 7 mov r5d, r5m mov r4d, r4m %ifdef PIC lea r6, [tab_LumaCoeff] vpbroadcastq m0, [r6 + r4 * 8] %else vpbroadcastq m0, [tab_LumaCoeff + r4 * 8] %endif mova m6, [tab_Lm + 32] mova m1, [tab_Lm] mov r9, %2 ;height add r3d, r3d vbroadcasti128 m2, [pw_2000] ; register map ; m0 - interpolate coeff ; m1 , m6 - shuffle order table ; m2 - pw_2000 xor r7, r7 ; loop count variable sub r0, 3 test r5d, r5d jz .label lea r8, [r1 * 3] ; r8 = (N / 2 - 1) * srcStride sub r0, r8 ; r0(src)-r8 add r9, 7 ; blkheight += N - 1 (7 - 1 = 6 ; since the last one row not in loop) .label ; Row 0 vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m4, m3, m6 ; row 0 (col 4 to 7) pshufb m3, m1 ; shuffled based on the col order tab_Lm row 0 (col 0 to 3) pmaddubsw m3, m0 pmaddubsw m4, m0 phaddw m3, m4 ; DWORD [R1D R1C R0D R0C R1B R1A R0B R0A] vbroadcasti128 m4, [r0 + 8] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m5, m4, m6 ;row 1 (col 4 to 7) pshufb m4, m1 ;row 1 (col 0 to 3) pmaddubsw m4, m0 pmaddubsw m5, m0 phaddw m4, m5 ; DWORD [R3D R3C R2D R2C R3B R3A R2B R2A] phaddw m3, m4 ; all rows and col completed. mova m5, [interp8_hps_shuf] vpermd m3, m5, m3 psubw m3, m2 movu [r2], m3 ;row 0 lea r0, [r0 + r1] ; first loop src ->5th row(i.e 4) lea r2, [r2 + r3] ; first loop dst ->5th row(i.e 4) dec r9d jnz .label RET %endif %endmacro IPFILTER_LUMA_PS_16x_AVX2 16 , 16 IPFILTER_LUMA_PS_16x_AVX2 16 , 8 IPFILTER_LUMA_PS_16x_AVX2 16 , 12 IPFILTER_LUMA_PS_16x_AVX2 16 , 4 IPFILTER_LUMA_PS_16x_AVX2 16 , 32 IPFILTER_LUMA_PS_16x_AVX2 16 , 64 ;-------------------------------------------------------------------------------------------------------------- ; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;-------------------------------------------------------------------------------------------------------------- %macro IPFILTER_LUMA_PP_W8 2 INIT_XMM sse4 cglobal interp_8tap_horiz_pp_%1x%2, 4,6,7 mov r4d, r4m %ifdef PIC lea r5, [tab_LumaCoeff] movh m3, [r5 + r4 * 8] %else movh m3, [tab_LumaCoeff + r4 * 8] %endif pshufd m0, m3, 0 ; m0 = coeff-L pshufd m1, m3, 0x55 ; m1 = coeff-H lea r5, [tab_Tm] ; r5 = shuffle mova m2, [pw_512] ; m2 = 512 mov r4d, %2 .loopH: %assign x 0 %rep %1 / 8 movu m3, [r0 - 3 + x] ; m3 = [F E D C B A 9 8 7 6 5 4 3 2 1 0] pshufb m4, m3, [r5 + 0*16] ; m4 = [6 5 4 3 5 4 3 2 4 3 2 1 3 2 1 0] pshufb m5, m3, [r5 + 1*16] ; m5 = [A 9 8 7 9 8 7 6 8 7 6 5 7 6 5 4] pshufb m3, [r5 + 2*16] ; m3 = [E D C B D C B A C B A 9 B A 9 8] pmaddubsw m4, m0 pmaddubsw m6, m5, m1 pmaddubsw m5, m0 pmaddubsw m3, m1 paddw m4, m6 paddw m5, m3 phaddw m4, m5 pmulhrsw m4, m2 packuswb m4, m4 movh [r2 + x], m4 %assign x x+8 %endrep add r0, r1 add r2, r3 dec r4d jnz .loopH RET %endmacro IPFILTER_LUMA_PP_W8 8, 4 IPFILTER_LUMA_PP_W8 8, 8 IPFILTER_LUMA_PP_W8 8, 16 IPFILTER_LUMA_PP_W8 8, 32 IPFILTER_LUMA_PP_W8 16, 4 IPFILTER_LUMA_PP_W8 16, 8 IPFILTER_LUMA_PP_W8 16, 12 IPFILTER_LUMA_PP_W8 16, 16 IPFILTER_LUMA_PP_W8 16, 32 IPFILTER_LUMA_PP_W8 16, 64 IPFILTER_LUMA_PP_W8 24, 32 IPFILTER_LUMA_PP_W8 32, 8 
IPFILTER_LUMA_PP_W8 32, 16 IPFILTER_LUMA_PP_W8 32, 24 IPFILTER_LUMA_PP_W8 32, 32 IPFILTER_LUMA_PP_W8 32, 64 IPFILTER_LUMA_PP_W8 48, 64 IPFILTER_LUMA_PP_W8 64, 16 IPFILTER_LUMA_PP_W8 64, 32 IPFILTER_LUMA_PP_W8 64, 48 IPFILTER_LUMA_PP_W8 64, 64 ;---------------------------------------------------------------------------------------------------------------------------- ; void interp_8tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt) ;---------------------------------------------------------------------------------------------------------------------------- IPFILTER_LUMA 4, 4, ps IPFILTER_LUMA 8, 8, ps IPFILTER_LUMA 8, 4, ps IPFILTER_LUMA 4, 8, ps IPFILTER_LUMA 16, 16, ps IPFILTER_LUMA 16, 8, ps IPFILTER_LUMA 8, 16, ps IPFILTER_LUMA 16, 12, ps IPFILTER_LUMA 12, 16, ps IPFILTER_LUMA 16, 4, ps IPFILTER_LUMA 4, 16, ps IPFILTER_LUMA 32, 32, ps IPFILTER_LUMA 32, 16, ps IPFILTER_LUMA 16, 32, ps IPFILTER_LUMA 32, 24, ps IPFILTER_LUMA 24, 32, ps IPFILTER_LUMA 32, 8, ps IPFILTER_LUMA 8, 32, ps IPFILTER_LUMA 64, 64, ps IPFILTER_LUMA 64, 32, ps IPFILTER_LUMA 32, 64, ps IPFILTER_LUMA 64, 48, ps IPFILTER_LUMA 48, 64, ps IPFILTER_LUMA 64, 16, ps IPFILTER_LUMA 16, 64, ps ;----------------------------------------------------------------------------- ; Interpolate HV ;----------------------------------------------------------------------------- %macro FILTER_HV8_START 7 ; (t0, t1, t2, t3, t4, off_src, off_coeff) -> (t3, t5), (t4, t1), [2] mova %5, [r0 + (%6 + 0) * 16] mova %1, [r0 + (%6 + 1) * 16] mova %2, [r0 + (%6 + 2) * 16] punpcklwd %3, %5, %1 punpckhwd %5, %1 pmaddwd %3, [r5 + (%7) * 16] ; R3 = L[0+1] -- Row 0 pmaddwd %5, [r5 + (%7) * 16] ; R0 = H[0+1] punpcklwd %4, %1, %2 punpckhwd %1, %2 pmaddwd %4, [r5 + (%7) * 16] ; R4 = L[1+2] -- Row 1 pmaddwd %1, [r5 + (%7) * 16] ; R1 = H[1+2] %endmacro ; FILTER_HV8_START %macro FILTER_HV8_MID 10 ; (Row3, prevRow, sum0L, sum1L, sum0H, sum1H, t6, t7, off_src, off_coeff) -> [6] mova %8, [r0 + (%9 + 0) * 16] mova %1, [r0 + (%9 + 1) * 16] punpcklwd %7, %2, %8 punpckhwd %2, %8 pmaddwd %7, [r5 + %10 * 16] pmaddwd %2, [r5 + %10 * 16] paddd %3, %7 ; R3 = L[0+1+2+3] -- Row 0 paddd %5, %2 ; R0 = H[0+1+2+3] punpcklwd %7, %8, %1 punpckhwd %8, %1 pmaddwd %7, [r5 + %10 * 16] pmaddwd %8, [r5 + %10 * 16] paddd %4, %7 ; R4 = L[1+2+3+4] -- Row 1 paddd %6, %8 ; R1 = H[1+2+3+4] %endmacro ; FILTER_HV8_MID ; Round and Saturate %macro FILTER_HV8_END 4 ; output in [1, 3] paddd %1, [pd_526336] paddd %2, [pd_526336] paddd %3, [pd_526336] paddd %4, [pd_526336] psrad %1, 12 psrad %2, 12 psrad %3, 12 psrad %4, 12 packssdw %1, %2 packssdw %3, %4 ; TODO: is merge better? 
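    ; (added note, not in the original) on the FILTER_HV8_END constants: the
    ; horizontal pass stores words biased by -8192 (pw_2000) and the eight
    ; vertical taps sum to 64, so the accumulated bias is -8192 * 64 = -524288;
    ; pd_526336 = 524288 + 2048 removes that bias and adds the 1 << 11
    ; rounding term for the final psrad by 12.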
; (re: the TODO above) keeping the two packssdw separate keeps the dependency chains short
    packuswb %1, %3
%endmacro ; FILTER_HV8_END

;-----------------------------------------------------------------------------
; void interp_8tap_hv_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int idxX, int idxY)
;-----------------------------------------------------------------------------
INIT_XMM ssse3
cglobal interp_8tap_hv_pp_8x8, 4, 7, 8, 0-15*16
%define coef m7
%define stk_buf rsp

    mov r4d, r4m
    mov r5d, r5m
%ifdef PIC
    lea r6, [tab_LumaCoeff]
    movh coef, [r6 + r4 * 8]
%else
    movh coef, [tab_LumaCoeff + r4 * 8]
%endif
    punpcklqdq coef, coef

    ; move to row -3
    lea r6, [r1 + r1 * 2]
    sub r0, r6

    xor r6, r6
    mov r4, rsp

.loopH:
    FILTER_H8_W8 m0, m1, m2, m3, coef, [pw_512], [r0 - 3]
    psubw m1, [pw_2000]
    mova [r4], m1

    add r0, r1
    add r4, 16
    inc r6
    cmp r6, 8+7
    jnz .loopH

    ; ready for phase V
    ; here all of the mN registers are free

    ; load coeff table
    shl r5, 6
    lea r6, [tab_LumaCoeffV]
    lea r5, [r5 + r6]

    ; load the intermediate buffer
    mov r0, stk_buf

    ; register mapping
    ; r0 - src
    ; r5 - coeff
    ; r6 - loop_i

    ; let's go
    xor r6, r6

    ; TODO: this loop has more than 70 instructions, which likely exceeds the Intel loop decode cache
.loopV:
    FILTER_HV8_START m1, m2, m3, m4, m0, 0, 0
    FILTER_HV8_MID m6, m2, m3, m4, m0, m1, m7, m5, 3, 1
    FILTER_HV8_MID m5, m6, m3, m4, m0, m1, m7, m2, 5, 2
    FILTER_HV8_MID m6, m5, m3, m4, m0, m1, m7, m2, 7, 3
    FILTER_HV8_END m3, m0, m4, m1

    movh [r2], m3
    movhps [r2 + r3], m3

    lea r0, [r0 + 16 * 2]
    lea r2, [r2 + r3 * 2]

    inc r6
    cmp r6, 8/2
    jnz .loopV

    RET

;-----------------------------------------------------------------------------
; void interp_8tap_hv_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int idxX, int idxY)
;-----------------------------------------------------------------------------
INIT_XMM sse3
cglobal interp_8tap_hv_pp_8x8, 4, 7, 8, 0-15*16
    mov r4d, r4m
    mov r5d, r5m
    add r4d, r4d
    pxor m6, m6

%ifdef PIC
    lea r6, [tabw_LumaCoeff]
    mova m3, [r6 + r4 * 8]
%else
    mova m3, [tabw_LumaCoeff + r4 * 8]
%endif

    ; move to row -3
    lea r6, [r1 + r1 * 2]
    sub r0, r6

    mov r4, rsp

%assign x 0 ; needed for the FILTER_H8_W8_sse2 macro
%assign y 1
%rep 15
    FILTER_H8_W8_sse2
    psubw m1, [pw_2000]
    mova [r4], m1
%if y < 15
    add r0, r1
    add r4, 16
%endif
%assign y y+1
%endrep

    ; ready for phase V
    ; here all of the mN registers are free

    ; load coeff table
    shl r5, 6
    lea r6, [tab_LumaCoeffV]
    lea r5, [r5 + r6]

    ; load the intermediate buffer
    mov r0, rsp

    ; register mapping
    ; r0 - src
    ; r5 - coeff

    ; let's go
%assign y 1
%rep 4
    FILTER_HV8_START m1, m2, m3, m4, m0, 0, 0
    FILTER_HV8_MID m6, m2, m3, m4, m0, m1, m7, m5, 3, 1
    FILTER_HV8_MID m5, m6, m3, m4, m0, m1, m7, m2, 5, 2
    FILTER_HV8_MID m6, m5, m3, m4, m0, m1, m7, m2, 7, 3
    FILTER_HV8_END m3, m0, m4, m1
    movh [r2], m3
    movhps [r2 + r3], m3
%if y < 4
    lea r0, [r0 + 16 * 2]
    lea r2, [r2 + r3 * 2]
%endif
%assign y y+1
%endrep
    RET

;-----------------------------------------------------------------------------
; void interp_4tap_vert_pp_2x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal interp_4tap_vert_pp_2x4, 4, 6, 8
    mov r4d, r4m
    sub r0, r1

%ifdef PIC
    lea r5, [tab_ChromaCoeff]
    movd m0, [r5 + r4 * 4]
%else
    movd m0, [tab_ChromaCoeff + r4 * 4]
%endif
    lea r4, [r1 * 3]
    lea r5, [r0 + 4 * r1]
    pshufb m0, [tab_Cm]
    mova m1, [pw_512]

    movd m2, [r0]
    movd m3, [r0 + r1]
    movd m4, [r0 + 2 * r1]
    movd m5, [r0 + r4]

    punpcklbw m2, m3
    punpcklbw m6, m4, m5
    punpcklbw m2, m6

    pmaddubsw m2, m0

    movd m6, [r5]

    punpcklbw m3, m4
    punpcklbw m7,
m5, m6 punpcklbw m3, m7 pmaddubsw m3, m0 phaddw m2, m3 pmulhrsw m2, m1 movd m7, [r5 + r1] punpcklbw m4, m5 punpcklbw m3, m6, m7 punpcklbw m4, m3 pmaddubsw m4, m0 movd m3, [r5 + 2 * r1] punpcklbw m5, m6 punpcklbw m7, m3 punpcklbw m5, m7 pmaddubsw m5, m0 phaddw m4, m5 pmulhrsw m4, m1 packuswb m2, m4 pextrw [r2], m2, 0 pextrw [r2 + r3], m2, 2 lea r2, [r2 + 2 * r3] pextrw [r2], m2, 4 pextrw [r2 + r3], m2, 6 RET %macro FILTER_VER_CHROMA_AVX2_2x4 1 INIT_YMM avx2 cglobal interp_4tap_vert_%1_2x4, 4, 6, 2 mov r4d, r4m shl r4d, 5 sub r0, r1 %ifdef PIC lea r5, [tab_ChromaCoeff_V] add r5, r4 %else lea r5, [tab_ChromaCoeff_V + r4] %endif lea r4, [r1 * 3] pinsrw xm1, [r0], 0 pinsrw xm1, [r0 + r1], 1 pinsrw xm1, [r0 + r1 * 2], 2 pinsrw xm1, [r0 + r4], 3 lea r0, [r0 + r1 * 4] pinsrw xm1, [r0], 4 pinsrw xm1, [r0 + r1], 5 pinsrw xm1, [r0 + r1 * 2], 6 pshufb xm0, xm1, [interp_vert_shuf] pshufb xm1, [interp_vert_shuf + 32] vinserti128 m0, m0, xm1, 1 pmaddubsw m0, [r5] vextracti128 xm1, m0, 1 paddw xm0, xm1 %ifidn %1,pp pmulhrsw xm0, [pw_512] packuswb xm0, xm0 lea r4, [r3 * 3] pextrw [r2], xm0, 0 pextrw [r2 + r3], xm0, 1 pextrw [r2 + r3 * 2], xm0, 2 pextrw [r2 + r4], xm0, 3 %else add r3d, r3d lea r4, [r3 * 3] psubw xm0, [pw_2000] movd [r2], xm0 pextrd [r2 + r3], xm0, 1 pextrd [r2 + r3 * 2], xm0, 2 pextrd [r2 + r4], xm0, 3 %endif RET %endmacro FILTER_VER_CHROMA_AVX2_2x4 pp FILTER_VER_CHROMA_AVX2_2x4 ps %macro FILTER_VER_CHROMA_AVX2_2x8 1 INIT_YMM avx2 cglobal interp_4tap_vert_%1_2x8, 4, 6, 2 mov r4d, r4m shl r4d, 6 sub r0, r1 %ifdef PIC lea r5, [tab_ChromaCoeffVer_32] add r5, r4 %else lea r5, [tab_ChromaCoeffVer_32 + r4] %endif lea r4, [r1 * 3] pinsrw xm1, [r0], 0 pinsrw xm1, [r0 + r1], 1 pinsrw xm1, [r0 + r1 * 2], 2 pinsrw xm1, [r0 + r4], 3 lea r0, [r0 + r1 * 4] pinsrw xm1, [r0], 4 pinsrw xm1, [r0 + r1], 5 pinsrw xm1, [r0 + r1 * 2], 6 pinsrw xm1, [r0 + r4], 7 movhlps xm0, xm1 lea r0, [r0 + r1 * 4] pinsrw xm0, [r0], 4 pinsrw xm0, [r0 + r1], 5 pinsrw xm0, [r0 + r1 * 2], 6 vinserti128 m1, m1, xm0, 1 pshufb m0, m1, [interp_vert_shuf] pshufb m1, [interp_vert_shuf + 32] pmaddubsw m0, [r5] pmaddubsw m1, [r5 + 1 * mmsize] paddw m0, m1 %ifidn %1,pp pmulhrsw m0, [pw_512] vextracti128 xm1, m0, 1 packuswb xm0, xm1 lea r4, [r3 * 3] pextrw [r2], xm0, 0 pextrw [r2 + r3], xm0, 1 pextrw [r2 + r3 * 2], xm0, 2 pextrw [r2 + r4], xm0, 3 lea r2, [r2 + r3 * 4] pextrw [r2], xm0, 4 pextrw [r2 + r3], xm0, 5 pextrw [r2 + r3 * 2], xm0, 6 pextrw [r2 + r4], xm0, 7 %else add r3d, r3d lea r4, [r3 * 3] psubw m0, [pw_2000] vextracti128 xm1, m0, 1 movd [r2], xm0 pextrd [r2 + r3], xm0, 1 pextrd [r2 + r3 * 2], xm0, 2 pextrd [r2 + r4], xm0, 3 lea r2, [r2 + r3 * 4] movd [r2], xm1 pextrd [r2 + r3], xm1, 1 pextrd [r2 + r3 * 2], xm1, 2 pextrd [r2 + r4], xm1, 3 %endif RET %endmacro FILTER_VER_CHROMA_AVX2_2x8 pp FILTER_VER_CHROMA_AVX2_2x8 ps %macro FILTER_VER_CHROMA_AVX2_2x16 1 INIT_YMM avx2 cglobal interp_4tap_vert_%1_2x16, 4, 6, 3 mov r4d, r4m shl r4d, 6 sub r0, r1 %ifdef PIC lea r5, [tab_ChromaCoeffVer_32] add r5, r4 %else lea r5, [tab_ChromaCoeffVer_32 + r4] %endif lea r4, [r1 * 3] movd xm1, [r0] pinsrw xm1, [r0 + r1], 1 pinsrw xm1, [r0 + r1 * 2], 2 pinsrw xm1, [r0 + r4], 3 lea r0, [r0 + r1 * 4] pinsrw xm1, [r0], 4 pinsrw xm1, [r0 + r1], 5 pinsrw xm1, [r0 + r1 * 2], 6 pinsrw xm1, [r0 + r4], 7 lea r0, [r0 + r1 * 4] pinsrw xm0, [r0], 4 pinsrw xm0, [r0 + r1], 5 pinsrw xm0, [r0 + r1 * 2], 6 pinsrw xm0, [r0 + r4], 7 punpckhqdq xm0, xm1, xm0 vinserti128 m1, m1, xm0, 1 pshufb m2, m1, [interp_vert_shuf] pshufb m1, [interp_vert_shuf + 32] pmaddubsw m2, [r5] 
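    ; (added note, layout assumed from the shl 6 indexing) the pmaddubsw above
    ; used the first 32 coefficient bytes (tap pair 0/1); the pmaddubsw below
    ; uses the second 32 bytes (tap pair 2/3), and paddw merges the two partial
    ; sums into the complete 4-tap vertical result.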
pmaddubsw m1, [r5 + 1 * mmsize] paddw m2, m1 lea r0, [r0 + r1 * 4] pinsrw xm1, [r0], 4 pinsrw xm1, [r0 + r1], 5 pinsrw xm1, [r0 + r1 * 2], 6 pinsrw xm1, [r0 + r4], 7 punpckhqdq xm1, xm0, xm1 lea r0, [r0 + r1 * 4] pinsrw xm0, [r0], 4 pinsrw xm0, [r0 + r1], 5 pinsrw xm0, [r0 + r1 * 2], 6 punpckhqdq xm0, xm1, xm0 vinserti128 m1, m1, xm0, 1 pshufb m0, m1, [interp_vert_shuf] pshufb m1, [interp_vert_shuf + 32] pmaddubsw m0, [r5] pmaddubsw m1, [r5 + 1 * mmsize] paddw m0, m1 %ifidn %1,pp mova m1, [pw_512] pmulhrsw m2, m1 pmulhrsw m0, m1 packuswb m2, m0 lea r4, [r3 * 3] pextrw [r2], xm2, 0 pextrw [r2 + r3], xm2, 1 pextrw [r2 + r3 * 2], xm2, 2 pextrw [r2 + r4], xm2, 3 vextracti128 xm0, m2, 1 lea r2, [r2 + r3 * 4] pextrw [r2], xm0, 0 pextrw [r2 + r3], xm0, 1 pextrw [r2 + r3 * 2], xm0, 2 pextrw [r2 + r4], xm0, 3 lea r2, [r2 + r3 * 4] pextrw [r2], xm2, 4 pextrw [r2 + r3], xm2, 5 pextrw [r2 + r3 * 2], xm2, 6 pextrw [r2 + r4], xm2, 7 lea r2, [r2 + r3 * 4] pextrw [r2], xm0, 4 pextrw [r2 + r3], xm0, 5 pextrw [r2 + r3 * 2], xm0, 6 pextrw [r2 + r4], xm0, 7 %else add r3d, r3d lea r4, [r3 * 3] vbroadcasti128 m1, [pw_2000] psubw m2, m1 psubw m0, m1 vextracti128 xm1, m2, 1 movd [r2], xm2 pextrd [r2 + r3], xm2, 1 pextrd [r2 + r3 * 2], xm2, 2 pextrd [r2 + r4], xm2, 3 lea r2, [r2 + r3 * 4] movd [r2], xm1 pextrd [r2 + r3], xm1, 1 pextrd [r2 + r3 * 2], xm1, 2 pextrd [r2 + r4], xm1, 3 vextracti128 xm1, m0, 1 lea r2, [r2 + r3 * 4] movd [r2], xm0 pextrd [r2 + r3], xm0, 1 pextrd [r2 + r3 * 2], xm0, 2 pextrd [r2 + r4], xm0, 3 lea r2, [r2 + r3 * 4] movd [r2], xm1 pextrd [r2 + r3], xm1, 1 pextrd [r2 + r3 * 2], xm1, 2 pextrd [r2 + r4], xm1, 3 %endif RET %endmacro FILTER_VER_CHROMA_AVX2_2x16 pp FILTER_VER_CHROMA_AVX2_2x16 ps ;----------------------------------------------------------------------------- ; void interp_4tap_vert_pp_2x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;----------------------------------------------------------------------------- %macro FILTER_V4_W2_H4 2 INIT_XMM sse4 cglobal interp_4tap_vert_pp_2x%2, 4, 6, 8 mov r4d, r4m sub r0, r1 %ifdef PIC lea r5, [tab_ChromaCoeff] movd m0, [r5 + r4 * 4] %else movd m0, [tab_ChromaCoeff + r4 * 4] %endif pshufb m0, [tab_Cm] mova m1, [pw_512] mov r4d, %2 lea r5, [3 * r1] .loop: movd m2, [r0] movd m3, [r0 + r1] movd m4, [r0 + 2 * r1] movd m5, [r0 + r5] punpcklbw m2, m3 punpcklbw m6, m4, m5 punpcklbw m2, m6 pmaddubsw m2, m0 lea r0, [r0 + 4 * r1] movd m6, [r0] punpcklbw m3, m4 punpcklbw m7, m5, m6 punpcklbw m3, m7 pmaddubsw m3, m0 phaddw m2, m3 pmulhrsw m2, m1 movd m7, [r0 + r1] punpcklbw m4, m5 punpcklbw m3, m6, m7 punpcklbw m4, m3 pmaddubsw m4, m0 movd m3, [r0 + 2 * r1] punpcklbw m5, m6 punpcklbw m7, m3 punpcklbw m5, m7 pmaddubsw m5, m0 phaddw m4, m5 pmulhrsw m4, m1 packuswb m2, m4 pextrw [r2], m2, 0 pextrw [r2 + r3], m2, 2 lea r2, [r2 + 2 * r3] pextrw [r2], m2, 4 pextrw [r2 + r3], m2, 6 lea r2, [r2 + 2 * r3] sub r4, 4 jnz .loop RET %endmacro FILTER_V4_W2_H4 2, 8 FILTER_V4_W2_H4 2, 16 ;----------------------------------------------------------------------------- ; void interp_4tap_vert_pp_4x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal interp_4tap_vert_pp_4x2, 4, 6, 6 mov r4d, r4m sub r0, r1 %ifdef PIC lea r5, [tab_ChromaCoeff] movd m0, [r5 + r4 * 4] %else movd m0, [tab_ChromaCoeff + r4 * 4] %endif pshufb m0, [tab_Cm] lea r5, [r0 + 2 * r1] movd m2, [r0] movd m3, [r0 + r1] movd m4, [r5] movd m5, [r5 + 
r1] punpcklbw m2, m3 punpcklbw m1, m4, m5 punpcklbw m2, m1 pmaddubsw m2, m0 movd m1, [r0 + 4 * r1] punpcklbw m3, m4 punpcklbw m5, m1 punpcklbw m3, m5 pmaddubsw m3, m0 phaddw m2, m3 pmulhrsw m2, [pw_512] packuswb m2, m2 movd [r2], m2 pextrd [r2 + r3], m2, 1 RET %macro FILTER_VER_CHROMA_AVX2_4x2 1 INIT_YMM avx2 cglobal interp_4tap_vert_%1_4x2, 4, 6, 4 mov r4d, r4m shl r4d, 5 sub r0, r1 %ifdef PIC lea r5, [tab_ChromaCoeff_V] add r5, r4 %else lea r5, [tab_ChromaCoeff_V + r4] %endif lea r4, [r1 * 3] movd xm1, [r0] movd xm2, [r0 + r1] punpcklbw xm1, xm2 movd xm3, [r0 + r1 * 2] punpcklbw xm2, xm3 movlhps xm1, xm2 movd xm0, [r0 + r4] punpcklbw xm3, xm0 movd xm2, [r0 + r1 * 4] punpcklbw xm0, xm2 movlhps xm3, xm0 vinserti128 m1, m1, xm3, 1 ; m1 = row[x x x 4 3 2 1 0] pmaddubsw m1, [r5] vextracti128 xm3, m1, 1 paddw xm1, xm3 %ifidn %1,pp pmulhrsw xm1, [pw_512] packuswb xm1, xm1 movd [r2], xm1 pextrd [r2 + r3], xm1, 1 %else add r3d, r3d psubw xm1, [pw_2000] movq [r2], xm1 movhps [r2 + r3], xm1 %endif RET %endmacro FILTER_VER_CHROMA_AVX2_4x2 pp FILTER_VER_CHROMA_AVX2_4x2 ps ;----------------------------------------------------------------------------- ; void interp_4tap_vert_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal interp_4tap_vert_pp_4x4, 4, 6, 8 mov r4d, r4m sub r0, r1 %ifdef PIC lea r5, [tab_ChromaCoeff] movd m0, [r5 + r4 * 4] %else movd m0, [tab_ChromaCoeff + r4 * 4] %endif pshufb m0, [tab_Cm] mova m1, [pw_512] lea r5, [r0 + 4 * r1] lea r4, [r1 * 3] movd m2, [r0] movd m3, [r0 + r1] movd m4, [r0 + 2 * r1] movd m5, [r0 + r4] punpcklbw m2, m3 punpcklbw m6, m4, m5 punpcklbw m2, m6 pmaddubsw m2, m0 movd m6, [r5] punpcklbw m3, m4 punpcklbw m7, m5, m6 punpcklbw m3, m7 pmaddubsw m3, m0 phaddw m2, m3 pmulhrsw m2, m1 movd m7, [r5 + r1] punpcklbw m4, m5 punpcklbw m3, m6, m7 punpcklbw m4, m3 pmaddubsw m4, m0 movd m3, [r5 + 2 * r1] punpcklbw m5, m6 punpcklbw m7, m3 punpcklbw m5, m7 pmaddubsw m5, m0 phaddw m4, m5 pmulhrsw m4, m1 packuswb m2, m4 movd [r2], m2 pextrd [r2 + r3], m2, 1 lea r2, [r2 + 2 * r3] pextrd [r2], m2, 2 pextrd [r2 + r3], m2, 3 RET %macro FILTER_VER_CHROMA_AVX2_4x4 1 INIT_YMM avx2 cglobal interp_4tap_vert_%1_4x4, 4, 6, 3 mov r4d, r4m shl r4d, 6 sub r0, r1 %ifdef PIC lea r5, [tab_ChromaCoeffVer_32] add r5, r4 %else lea r5, [tab_ChromaCoeffVer_32 + r4] %endif lea r4, [r1 * 3] movd xm1, [r0] pinsrd xm1, [r0 + r1], 1 pinsrd xm1, [r0 + r1 * 2], 2 pinsrd xm1, [r0 + r4], 3 ; m1 = row[3 2 1 0] lea r0, [r0 + r1 * 4] movd xm2, [r0] pinsrd xm2, [r0 + r1], 1 pinsrd xm2, [r0 + r1 * 2], 2 ; m2 = row[x 6 5 4] vinserti128 m1, m1, xm2, 1 ; m1 = row[x 6 5 4 3 2 1 0] mova m2, [interp4_vpp_shuf1] vpermd m0, m2, m1 ; m0 = row[4 3 3 2 2 1 1 0] mova m2, [interp4_vpp_shuf1 + mmsize] vpermd m1, m2, m1 ; m1 = row[6 5 5 4 4 3 3 2] mova m2, [interp4_vpp_shuf] pshufb m0, m0, m2 pshufb m1, m1, m2 pmaddubsw m0, [r5] pmaddubsw m1, [r5 + mmsize] paddw m0, m1 ; m0 = WORD ROW[3 2 1 0] %ifidn %1,pp pmulhrsw m0, [pw_512] vextracti128 xm1, m0, 1 packuswb xm0, xm1 lea r5, [r3 * 3] movd [r2], xm0 pextrd [r2 + r3], xm0, 1 pextrd [r2 + r3 * 2], xm0, 2 pextrd [r2 + r5], xm0, 3 %else add r3d, r3d psubw m0, [pw_2000] vextracti128 xm1, m0, 1 lea r5, [r3 * 3] movq [r2], xm0 movhps [r2 + r3], xm0 movq [r2 + r3 * 2], xm1 movhps [r2 + r5], xm1 %endif RET %endmacro FILTER_VER_CHROMA_AVX2_4x4 pp FILTER_VER_CHROMA_AVX2_4x4 ps %macro FILTER_VER_CHROMA_AVX2_4x8 1 INIT_YMM avx2 cglobal 
interp_4tap_vert_%1_4x8, 4, 6, 5 mov r4d, r4m shl r4d, 6 sub r0, r1 %ifdef PIC lea r5, [tab_ChromaCoeffVer_32] add r5, r4 %else lea r5, [tab_ChromaCoeffVer_32 + r4] %endif lea r4, [r1 * 3] movd xm1, [r0] pinsrd xm1, [r0 + r1], 1 pinsrd xm1, [r0 + r1 * 2], 2 pinsrd xm1, [r0 + r4], 3 ; m1 = row[3 2 1 0] lea r0, [r0 + r1 * 4] movd xm2, [r0] pinsrd xm2, [r0 + r1], 1 pinsrd xm2, [r0 + r1 * 2], 2 pinsrd xm2, [r0 + r4], 3 ; m2 = row[7 6 5 4] vinserti128 m1, m1, xm2, 1 ; m1 = row[7 6 5 4 3 2 1 0] lea r0, [r0 + r1 * 4] movd xm3, [r0] pinsrd xm3, [r0 + r1], 1 pinsrd xm3, [r0 + r1 * 2], 2 ; m3 = row[x 10 9 8] vinserti128 m2, m2, xm3, 1 ; m2 = row[x 10 9 8 7 6 5 4] mova m3, [interp4_vpp_shuf1] vpermd m0, m3, m1 ; m0 = row[4 3 3 2 2 1 1 0] vpermd m4, m3, m2 ; m4 = row[8 7 7 6 6 5 5 4] mova m3, [interp4_vpp_shuf1 + mmsize] vpermd m1, m3, m1 ; m1 = row[6 5 5 4 4 3 3 2] vpermd m2, m3, m2 ; m2 = row[10 9 9 8 8 7 7 6] mova m3, [interp4_vpp_shuf] pshufb m0, m0, m3 pshufb m1, m1, m3 pshufb m2, m2, m3 pshufb m4, m4, m3 pmaddubsw m0, [r5] pmaddubsw m4, [r5] pmaddubsw m1, [r5 + mmsize] pmaddubsw m2, [r5 + mmsize] paddw m0, m1 ; m0 = WORD ROW[3 2 1 0] paddw m4, m2 ; m4 = WORD ROW[7 6 5 4] %ifidn %1,pp pmulhrsw m0, [pw_512] pmulhrsw m4, [pw_512] packuswb m0, m4 vextracti128 xm1, m0, 1 lea r5, [r3 * 3] movd [r2], xm0 pextrd [r2 + r3], xm0, 1 movd [r2 + r3 * 2], xm1 pextrd [r2 + r5], xm1, 1 lea r2, [r2 + r3 * 4] pextrd [r2], xm0, 2 pextrd [r2 + r3], xm0, 3 pextrd [r2 + r3 * 2], xm1, 2 pextrd [r2 + r5], xm1, 3 %else add r3d, r3d psubw m0, [pw_2000] psubw m4, [pw_2000] vextracti128 xm1, m0, 1 vextracti128 xm2, m4, 1 lea r5, [r3 * 3] movq [r2], xm0 movhps [r2 + r3], xm0 movq [r2 + r3 * 2], xm1 movhps [r2 + r5], xm1 lea r2, [r2 + r3 * 4] movq [r2], xm4 movhps [r2 + r3], xm4 movq [r2 + r3 * 2], xm2 movhps [r2 + r5], xm2 %endif RET %endmacro FILTER_VER_CHROMA_AVX2_4x8 pp FILTER_VER_CHROMA_AVX2_4x8 ps %macro FILTER_VER_CHROMA_AVX2_4xN 2 %if ARCH_X86_64 == 1 INIT_YMM avx2 cglobal interp_4tap_vert_%1_4x%2, 4, 6, 12 mov r4d, r4m shl r4d, 6 sub r0, r1 %ifdef PIC lea r5, [tab_ChromaCoeffVer_32] add r5, r4 %else lea r5, [tab_ChromaCoeffVer_32 + r4] %endif lea r4, [r1 * 3] mova m10, [r5] mova m11, [r5 + mmsize] %ifidn %1,pp mova m9, [pw_512] %else add r3d, r3d mova m9, [pw_2000] %endif lea r5, [r3 * 3] %rep %2 / 16 movd xm1, [r0] pinsrd xm1, [r0 + r1], 1 pinsrd xm1, [r0 + r1 * 2], 2 pinsrd xm1, [r0 + r4], 3 ; m1 = row[3 2 1 0] lea r0, [r0 + r1 * 4] movd xm2, [r0] pinsrd xm2, [r0 + r1], 1 pinsrd xm2, [r0 + r1 * 2], 2 pinsrd xm2, [r0 + r4], 3 ; m2 = row[7 6 5 4] vinserti128 m1, m1, xm2, 1 ; m1 = row[7 6 5 4 3 2 1 0] lea r0, [r0 + r1 * 4] movd xm3, [r0] pinsrd xm3, [r0 + r1], 1 pinsrd xm3, [r0 + r1 * 2], 2 pinsrd xm3, [r0 + r4], 3 ; m3 = row[11 10 9 8] vinserti128 m2, m2, xm3, 1 ; m2 = row[11 10 9 8 7 6 5 4] lea r0, [r0 + r1 * 4] movd xm4, [r0] pinsrd xm4, [r0 + r1], 1 pinsrd xm4, [r0 + r1 * 2], 2 pinsrd xm4, [r0 + r4], 3 ; m4 = row[15 14 13 12] vinserti128 m3, m3, xm4, 1 ; m3 = row[15 14 13 12 11 10 9 8] lea r0, [r0 + r1 * 4] movd xm5, [r0] pinsrd xm5, [r0 + r1], 1 pinsrd xm5, [r0 + r1 * 2], 2 ; m5 = row[x 18 17 16] vinserti128 m4, m4, xm5, 1 ; m4 = row[x 18 17 16 15 14 13 12] mova m5, [interp4_vpp_shuf1] vpermd m0, m5, m1 ; m0 = row[4 3 3 2 2 1 1 0] vpermd m6, m5, m2 ; m6 = row[8 7 7 6 6 5 5 4] vpermd m7, m5, m3 ; m7 = row[12 11 11 10 10 9 9 8] vpermd m8, m5, m4 ; m8 = row[16 15 15 14 14 13 13 12] mova m5, [interp4_vpp_shuf1 + mmsize] vpermd m1, m5, m1 ; m1 = row[6 5 5 4 4 3 3 2] vpermd m2, m5, m2 ; m2 = row[10 9 9 8 8 7 7 6] 
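; same rearrangement for the upper half: m3/m4 become the overlapping
; row pairs that feed taps 2-3 of output rows 8-15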
vpermd m3, m5, m3 ; m3 = row[14 13 13 12 12 11 11 10] vpermd m4, m5, m4 ; m4 = row[18 17 17 16 16 15 15 14] mova m5, [interp4_vpp_shuf] pshufb m0, m0, m5 pshufb m1, m1, m5 pshufb m2, m2, m5 pshufb m4, m4, m5 pshufb m3, m3, m5 pshufb m6, m6, m5 pshufb m7, m7, m5 pshufb m8, m8, m5 pmaddubsw m0, m10 pmaddubsw m6, m10 pmaddubsw m7, m10 pmaddubsw m8, m10 pmaddubsw m1, m11 pmaddubsw m2, m11 pmaddubsw m3, m11 pmaddubsw m4, m11 paddw m0, m1 ; m0 = WORD ROW[3 2 1 0] paddw m6, m2 ; m6 = WORD ROW[7 6 5 4] paddw m7, m3 ; m7 = WORD ROW[11 10 9 8] paddw m8, m4 ; m8 = WORD ROW[15 14 13 12] %ifidn %1,pp pmulhrsw m0, m9 pmulhrsw m6, m9 pmulhrsw m7, m9 pmulhrsw m8, m9 packuswb m0, m6 packuswb m7, m8 vextracti128 xm1, m0, 1 vextracti128 xm2, m7, 1 movd [r2], xm0 pextrd [r2 + r3], xm0, 1 movd [r2 + r3 * 2], xm1 pextrd [r2 + r5], xm1, 1 lea r2, [r2 + r3 * 4] pextrd [r2], xm0, 2 pextrd [r2 + r3], xm0, 3 pextrd [r2 + r3 * 2], xm1, 2 pextrd [r2 + r5], xm1, 3 lea r2, [r2 + r3 * 4] movd [r2], xm7 pextrd [r2 + r3], xm7, 1 movd [r2 + r3 * 2], xm2 pextrd [r2 + r5], xm2, 1 lea r2, [r2 + r3 * 4] pextrd [r2], xm7, 2 pextrd [r2 + r3], xm7, 3 pextrd [r2 + r3 * 2], xm2, 2 pextrd [r2 + r5], xm2, 3 %else psubw m0, m9 psubw m6, m9 psubw m7, m9 psubw m8, m9 vextracti128 xm1, m0, 1 vextracti128 xm2, m6, 1 vextracti128 xm3, m7, 1 vextracti128 xm4, m8, 1 movq [r2], xm0 movhps [r2 + r3], xm0 movq [r2 + r3 * 2], xm1 movhps [r2 + r5], xm1 lea r2, [r2 + r3 * 4] movq [r2], xm6 movhps [r2 + r3], xm6 movq [r2 + r3 * 2], xm2 movhps [r2 + r5], xm2 lea r2, [r2 + r3 * 4] movq [r2], xm7 movhps [r2 + r3], xm7 movq [r2 + r3 * 2], xm3 movhps [r2 + r5], xm3 lea r2, [r2 + r3 * 4] movq [r2], xm8 movhps [r2 + r3], xm8 movq [r2 + r3 * 2], xm4 movhps [r2 + r5], xm4 %endif lea r2, [r2 + r3 * 4] %endrep RET %endif %endmacro FILTER_VER_CHROMA_AVX2_4xN pp, 16 FILTER_VER_CHROMA_AVX2_4xN ps, 16 FILTER_VER_CHROMA_AVX2_4xN pp, 32 FILTER_VER_CHROMA_AVX2_4xN ps, 32 ;----------------------------------------------------------------------------- ; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;----------------------------------------------------------------------------- %macro FILTER_V4_W4_H4 2 INIT_XMM sse4 cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8 mov r4d, r4m sub r0, r1 %ifdef PIC lea r5, [tab_ChromaCoeff] movd m0, [r5 + r4 * 4] %else movd m0, [tab_ChromaCoeff + r4 * 4] %endif pshufb m0, [tab_Cm] mova m1, [pw_512] mov r4d, %2 lea r5, [3 * r1] .loop: movd m2, [r0] movd m3, [r0 + r1] movd m4, [r0 + 2 * r1] movd m5, [r0 + r5] punpcklbw m2, m3 punpcklbw m6, m4, m5 punpcklbw m2, m6 pmaddubsw m2, m0 lea r0, [r0 + 4 * r1] movd m6, [r0] punpcklbw m3, m4 punpcklbw m7, m5, m6 punpcklbw m3, m7 pmaddubsw m3, m0 phaddw m2, m3 pmulhrsw m2, m1 movd m7, [r0 + r1] punpcklbw m4, m5 punpcklbw m3, m6, m7 punpcklbw m4, m3 pmaddubsw m4, m0 movd m3, [r0 + 2 * r1] punpcklbw m5, m6 punpcklbw m7, m3 punpcklbw m5, m7 pmaddubsw m5, m0 phaddw m4, m5 pmulhrsw m4, m1 packuswb m2, m4 movd [r2], m2 pextrd [r2 + r3], m2, 1 lea r2, [r2 + 2 * r3] pextrd [r2], m2, 2 pextrd [r2 + r3], m2, 3 lea r2, [r2 + 2 * r3] sub r4, 4 jnz .loop RET %endmacro FILTER_V4_W4_H4 4, 8 FILTER_V4_W4_H4 4, 16 FILTER_V4_W4_H4 4, 32 %macro FILTER_V4_W8_H2 0 punpcklbw m1, m2 punpcklbw m7, m3, m0 pmaddubsw m1, m6 pmaddubsw m7, m5 paddw m1, m7 pmulhrsw m1, m4 packuswb m1, m1 %endmacro %macro FILTER_V4_W8_H3 0 punpcklbw m2, m3 punpcklbw m7, m0, m1 pmaddubsw m2, m6 pmaddubsw m7, m5 paddw m2, m7 pmulhrsw m2, m4 packuswb m2, m2 %endmacro %macro FILTER_V4_W8_H4 0 
punpcklbw m3, m0 punpcklbw m7, m1, m2 pmaddubsw m3, m6 pmaddubsw m7, m5 paddw m3, m7 pmulhrsw m3, m4 packuswb m3, m3 %endmacro %macro FILTER_V4_W8_H5 0 punpcklbw m0, m1 punpcklbw m7, m2, m3 pmaddubsw m0, m6 pmaddubsw m7, m5 paddw m0, m7 pmulhrsw m0, m4 packuswb m0, m0 %endmacro %macro FILTER_V4_W8_8x2 2 FILTER_V4_W8 %1, %2 movq m0, [r0 + 4 * r1] FILTER_V4_W8_H2 movh [r2 + r3], m1 %endmacro %macro FILTER_V4_W8_8x4 2 FILTER_V4_W8_8x2 %1, %2 ;8x3 lea r6, [r0 + 4 * r1] movq m1, [r6 + r1] FILTER_V4_W8_H3 movh [r2 + 2 * r3], m2 ;8x4 movq m2, [r6 + 2 * r1] FILTER_V4_W8_H4 lea r5, [r2 + 2 * r3] movh [r5 + r3], m3 %endmacro %macro FILTER_V4_W8_8x6 2 FILTER_V4_W8_8x4 %1, %2 ;8x5 lea r6, [r6 + 2 * r1] movq m3, [r6 + r1] FILTER_V4_W8_H5 movh [r2 + 4 * r3], m0 ;8x6 movq m0, [r0 + 8 * r1] FILTER_V4_W8_H2 lea r5, [r2 + 4 * r3] movh [r5 + r3], m1 %endmacro ;----------------------------------------------------------------------------- ; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;----------------------------------------------------------------------------- %macro FILTER_V4_W8 2 INIT_XMM sse4 cglobal interp_4tap_vert_pp_%1x%2, 4, 7, 8 mov r4d, r4m sub r0, r1 movq m0, [r0] movq m1, [r0 + r1] movq m2, [r0 + 2 * r1] lea r5, [r0 + 2 * r1] movq m3, [r5 + r1] punpcklbw m0, m1 punpcklbw m4, m2, m3 %ifdef PIC lea r6, [tab_ChromaCoeff] movd m5, [r6 + r4 * 4] %else movd m5, [tab_ChromaCoeff + r4 * 4] %endif pshufb m6, m5, [tab_Vm] pmaddubsw m0, m6 pshufb m5, [tab_Vm + 16] pmaddubsw m4, m5 paddw m0, m4 mova m4, [pw_512] pmulhrsw m0, m4 packuswb m0, m0 movh [r2], m0 %endmacro ;----------------------------------------------------------------------------- ; void interp_4tap_vert_pp_8x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;----------------------------------------------------------------------------- FILTER_V4_W8_8x2 8, 2 RET ;----------------------------------------------------------------------------- ; void interp_4tap_vert_pp_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;----------------------------------------------------------------------------- FILTER_V4_W8_8x4 8, 4 RET ;----------------------------------------------------------------------------- ; void interp_4tap_vert_pp_8x6(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;----------------------------------------------------------------------------- FILTER_V4_W8_8x6 8, 6 RET ;------------------------------------------------------------------------------------------------------------- ; void interp_4tap_vert_ps_4x2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) ;------------------------------------------------------------------------------------------------------------- INIT_XMM sse4 cglobal interp_4tap_vert_ps_4x2, 4, 6, 6 mov r4d, r4m sub r0, r1 add r3d, r3d %ifdef PIC lea r5, [tab_ChromaCoeff] movd m0, [r5 + r4 * 4] %else movd m0, [tab_ChromaCoeff + r4 * 4] %endif pshufb m0, [tab_Cm] movd m2, [r0] movd m3, [r0 + r1] lea r5, [r0 + 2 * r1] movd m4, [r5] movd m5, [r5 + r1] punpcklbw m2, m3 punpcklbw m1, m4, m5 punpcklbw m2, m1 pmaddubsw m2, m0 movd m1, [r0 + 4 * r1] punpcklbw m3, m4 punpcklbw m5, m1 punpcklbw m3, m5 pmaddubsw m3, m0 phaddw m2, m3 psubw m2, [pw_2000] movh [r2], m2 movhps [r2 + r3], m2 RET ;------------------------------------------------------------------------------------------------------------- ; void interp_4tap_vert_ps_4x4(pixel *src, 
intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) ;------------------------------------------------------------------------------------------------------------- INIT_XMM sse4 cglobal interp_4tap_vert_ps_4x4, 4, 6, 7 mov r4d, r4m sub r0, r1 add r3d, r3d %ifdef PIC lea r5, [tab_ChromaCoeff] movd m0, [r5 + r4 * 4] %else movd m0, [tab_ChromaCoeff + r4 * 4] %endif pshufb m0, [tab_Cm] lea r4, [r1 * 3] lea r5, [r0 + 4 * r1] movd m2, [r0] movd m3, [r0 + r1] movd m4, [r0 + 2 * r1] movd m5, [r0 + r4] punpcklbw m2, m3 punpcklbw m6, m4, m5 punpcklbw m2, m6 pmaddubsw m2, m0 movd m6, [r5] punpcklbw m3, m4 punpcklbw m1, m5, m6 punpcklbw m3, m1 pmaddubsw m3, m0 phaddw m2, m3 mova m1, [pw_2000] psubw m2, m1 movh [r2], m2 movhps [r2 + r3], m2 movd m2, [r5 + r1] punpcklbw m4, m5 punpcklbw m3, m6, m2 punpcklbw m4, m3 pmaddubsw m4, m0 movd m3, [r5 + 2 * r1] punpcklbw m5, m6 punpcklbw m2, m3 punpcklbw m5, m2 pmaddubsw m5, m0 phaddw m4, m5 psubw m4, m1 lea r2, [r2 + 2 * r3] movh [r2], m4 movhps [r2 + r3], m4 RET ;--------------------------------------------------------------------------------------------------------------- ; void interp_4tap_vert_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) ;--------------------------------------------------------------------------------------------------------------- %macro FILTER_V_PS_W4_H4 2 INIT_XMM sse4 cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8 mov r4d, r4m sub r0, r1 add r3d, r3d %ifdef PIC lea r5, [tab_ChromaCoeff] movd m0, [r5 + r4 * 4] %else movd m0, [tab_ChromaCoeff + r4 * 4] %endif pshufb m0, [tab_Cm] mova m1, [pw_2000] mov r4d, %2/4 lea r5, [3 * r1] .loop: movd m2, [r0] movd m3, [r0 + r1] movd m4, [r0 + 2 * r1] movd m5, [r0 + r5] punpcklbw m2, m3 punpcklbw m6, m4, m5 punpcklbw m2, m6 pmaddubsw m2, m0 lea r0, [r0 + 4 * r1] movd m6, [r0] punpcklbw m3, m4 punpcklbw m7, m5, m6 punpcklbw m3, m7 pmaddubsw m3, m0 phaddw m2, m3 psubw m2, m1 movh [r2], m2 movhps [r2 + r3], m2 movd m2, [r0 + r1] punpcklbw m4, m5 punpcklbw m3, m6, m2 punpcklbw m4, m3 pmaddubsw m4, m0 movd m3, [r0 + 2 * r1] punpcklbw m5, m6 punpcklbw m2, m3 punpcklbw m5, m2 pmaddubsw m5, m0 phaddw m4, m5 psubw m4, m1 lea r2, [r2 + 2 * r3] movh [r2], m4 movhps [r2 + r3], m4 lea r2, [r2 + 2 * r3] dec r4d jnz .loop RET %endmacro FILTER_V_PS_W4_H4 4, 8 FILTER_V_PS_W4_H4 4, 16 FILTER_V_PS_W4_H4 4, 32 ;-------------------------------------------------------------------------------------------------------------- ; void interp_4tap_vert_ps_8x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) ;-------------------------------------------------------------------------------------------------------------- %macro FILTER_V_PS_W8_H8_H16_H2 2 INIT_XMM sse4 cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 7 mov r4d, r4m sub r0, r1 add r3d, r3d %ifdef PIC lea r5, [tab_ChromaCoeff] movd m5, [r5 + r4 * 4] %else movd m5, [tab_ChromaCoeff + r4 * 4] %endif pshufb m6, m5, [tab_Vm] pshufb m5, [tab_Vm + 16] mova m4, [pw_2000] mov r4d, %2/2 lea r5, [3 * r1] .loopH: movq m0, [r0] movq m1, [r0 + r1] movq m2, [r0 + 2 * r1] movq m3, [r0 + r5] punpcklbw m0, m1 punpcklbw m1, m2 punpcklbw m2, m3 pmaddubsw m0, m6 pmaddubsw m2, m5 paddw m0, m2 psubw m0, m4 movu [r2], m0 movq m0, [r0 + 4 * r1] punpcklbw m3, m0 pmaddubsw m1, m6 pmaddubsw m3, m5 paddw m1, m3 psubw m1, m4 movu [r2 + r3], m1 lea r0, [r0 + 2 * r1] lea r2, [r2 + 2 * r3] dec r4d jnz .loopH RET %endmacro FILTER_V_PS_W8_H8_H16_H2 8, 2 FILTER_V_PS_W8_H8_H16_H2 8, 4 FILTER_V_PS_W8_H8_H16_H2 8, 6 
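;-----------------------------------------------------------------------------
; Reference model: a minimal C sketch of what the interp_4tap_vert_ps kernels
; compute, assuming the conventions used throughout this file (four signed
; chroma taps that sum to 64, source pre-decremented one row via "sub r0, r1",
; stride arguments counted in elements). The helper name ref_vert_ps is
; hypothetical, for illustration only.
;
;   #include <stdint.h>
;   static void ref_vert_ps(const uint8_t *src, intptr_t srcStride,
;                           int16_t *dst, intptr_t dstStride,
;                           const int8_t c[4], int w, int h)
;   {
;       src -= srcStride;                          /* mirrors "sub r0, r1" */
;       for (int y = 0; y < h; y++, src += srcStride, dst += dstStride)
;           for (int x = 0; x < w; x++) {
;               int sum = c[0] * src[x]
;                       + c[1] * src[x + 1 * srcStride]
;                       + c[2] * src[x + 2 * srcStride]
;                       + c[3] * src[x + 3 * srcStride];
;               dst[x] = (int16_t)(sum - 8192);    /* psubw with [pw_2000] */
;           }
;   }
;
; The pp kernels differ only in the final step: they round and clamp with
; clip8((sum + 32) >> 6), which "pmulhrsw m*, [pw_512]" plus packuswb
; implements exactly. The "add r3d, r3d" in the ps paths doubles the
; destination stride because the asm addresses bytes while dstStride above
; counts int16_t elements.
;-----------------------------------------------------------------------------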
FILTER_V_PS_W8_H8_H16_H2 8, 12 FILTER_V_PS_W8_H8_H16_H2 8, 64 ;-------------------------------------------------------------------------------------------------------------- ; void interp_4tap_vert_ps_8x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) ;-------------------------------------------------------------------------------------------------------------- %macro FILTER_V_PS_W8_H8_H16_H32 2 INIT_XMM sse4 cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8 mov r4d, r4m sub r0, r1 add r3d, r3d %ifdef PIC lea r5, [tab_ChromaCoeff] movd m5, [r5 + r4 * 4] %else movd m5, [tab_ChromaCoeff + r4 * 4] %endif pshufb m6, m5, [tab_Vm] pshufb m5, [tab_Vm + 16] mova m4, [pw_2000] mov r4d, %2/4 lea r5, [3 * r1] .loop: movq m0, [r0] movq m1, [r0 + r1] movq m2, [r0 + 2 * r1] movq m3, [r0 + r5] punpcklbw m0, m1 punpcklbw m1, m2 punpcklbw m2, m3 pmaddubsw m0, m6 pmaddubsw m7, m2, m5 paddw m0, m7 psubw m0, m4 movu [r2], m0 lea r0, [r0 + 4 * r1] movq m0, [r0] punpcklbw m3, m0 pmaddubsw m1, m6 pmaddubsw m7, m3, m5 paddw m1, m7 psubw m1, m4 movu [r2 + r3], m1 movq m1, [r0 + r1] punpcklbw m0, m1 pmaddubsw m2, m6 pmaddubsw m0, m5 paddw m2, m0 psubw m2, m4 lea r2, [r2 + 2 * r3] movu [r2], m2 movq m2, [r0 + 2 * r1] punpcklbw m1, m2 pmaddubsw m3, m6 pmaddubsw m1, m5 paddw m3, m1 psubw m3, m4 movu [r2 + r3], m3 lea r2, [r2 + 2 * r3] dec r4d jnz .loop RET %endmacro FILTER_V_PS_W8_H8_H16_H32 8, 8 FILTER_V_PS_W8_H8_H16_H32 8, 16 FILTER_V_PS_W8_H8_H16_H32 8, 32 ;------------------------------------------------------------------------------------------------------------ ;void interp_4tap_vert_ps_6x8(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) ;------------------------------------------------------------------------------------------------------------ %macro FILTER_V_PS_W6 2 INIT_XMM sse4 cglobal interp_4tap_vert_ps_6x%2, 4, 6, 8 mov r4d, r4m sub r0, r1 add r3d, r3d %ifdef PIC lea r5, [tab_ChromaCoeff] movd m5, [r5 + r4 * 4] %else movd m5, [tab_ChromaCoeff + r4 * 4] %endif pshufb m6, m5, [tab_Vm] pshufb m5, [tab_Vm + 16] mova m4, [pw_2000] lea r5, [3 * r1] mov r4d, %2/4 .loop: movq m0, [r0] movq m1, [r0 + r1] movq m2, [r0 + 2 * r1] movq m3, [r0 + r5] punpcklbw m0, m1 punpcklbw m1, m2 punpcklbw m2, m3 pmaddubsw m0, m6 pmaddubsw m7, m2, m5 paddw m0, m7 psubw m0, m4 movh [r2], m0 pshufd m0, m0, 2 movd [r2 + 8], m0 lea r0, [r0 + 4 * r1] movq m0, [r0] punpcklbw m3, m0 pmaddubsw m1, m6 pmaddubsw m7, m3, m5 paddw m1, m7 psubw m1, m4 movh [r2 + r3], m1 pshufd m1, m1, 2 movd [r2 + r3 + 8], m1 movq m1, [r0 + r1] punpcklbw m0, m1 pmaddubsw m2, m6 pmaddubsw m0, m5 paddw m2, m0 psubw m2, m4 lea r2,[r2 + 2 * r3] movh [r2], m2 pshufd m2, m2, 2 movd [r2 + 8], m2 movq m2,[r0 + 2 * r1] punpcklbw m1, m2 pmaddubsw m3, m6 pmaddubsw m1, m5 paddw m3, m1 psubw m3, m4 movh [r2 + r3], m3 pshufd m3, m3, 2 movd [r2 + r3 + 8], m3 lea r2, [r2 + 2 * r3] dec r4d jnz .loop RET %endmacro FILTER_V_PS_W6 6, 8 FILTER_V_PS_W6 6, 16 ;--------------------------------------------------------------------------------------------------------------- ; void interp_4tap_vert_ps_12x16(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) ;--------------------------------------------------------------------------------------------------------------- %macro FILTER_V_PS_W12 2 INIT_XMM sse4 cglobal interp_4tap_vert_ps_12x%2, 4, 6, 8 mov r4d, r4m sub r0, r1 add r3d, r3d %ifdef PIC lea r5, [tab_ChromaCoeff] movd m0, [r5 + r4 * 4] %else movd m0, [tab_ChromaCoeff + r4 * 4] %endif pshufb 
m1, m0, [tab_Vm] pshufb m0, [tab_Vm + 16] mov r4d, %2/2 .loop: movu m2, [r0] movu m3, [r0 + r1] punpcklbw m4, m2, m3 punpckhbw m2, m3 pmaddubsw m4, m1 pmaddubsw m2, m1 lea r0, [r0 + 2 * r1] movu m5, [r0] movu m7, [r0 + r1] punpcklbw m6, m5, m7 pmaddubsw m6, m0 paddw m4, m6 punpckhbw m6, m5, m7 pmaddubsw m6, m0 paddw m2, m6 mova m6, [pw_2000] psubw m4, m6 psubw m2, m6 movu [r2], m4 movh [r2 + 16], m2 punpcklbw m4, m3, m5 punpckhbw m3, m5 pmaddubsw m4, m1 pmaddubsw m3, m1 movu m2, [r0 + 2 * r1] punpcklbw m5, m7, m2 punpckhbw m7, m2 pmaddubsw m5, m0 pmaddubsw m7, m0 paddw m4, m5 paddw m3, m7 psubw m4, m6 psubw m3, m6 movu [r2 + r3], m4 movh [r2 + r3 + 16], m3 lea r2, [r2 + 2 * r3] dec r4d jnz .loop RET %endmacro FILTER_V_PS_W12 12, 16 FILTER_V_PS_W12 12, 32 ;--------------------------------------------------------------------------------------------------------------- ; void interp_4tap_vert_ps_16x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) ;--------------------------------------------------------------------------------------------------------------- %macro FILTER_V_PS_W16 2 INIT_XMM sse4 cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8 mov r4d, r4m sub r0, r1 add r3d, r3d %ifdef PIC lea r5, [tab_ChromaCoeff] movd m0, [r5 + r4 * 4] %else movd m0, [tab_ChromaCoeff + r4 * 4] %endif pshufb m1, m0, [tab_Vm] pshufb m0, [tab_Vm + 16] mov r4d, %2/2 .loop: movu m2, [r0] movu m3, [r0 + r1] punpcklbw m4, m2, m3 punpckhbw m2, m3 pmaddubsw m4, m1 pmaddubsw m2, m1 lea r0, [r0 + 2 * r1] movu m5, [r0] movu m7, [r0 + r1] punpcklbw m6, m5, m7 pmaddubsw m6, m0 paddw m4, m6 punpckhbw m6, m5, m7 pmaddubsw m6, m0 paddw m2, m6 mova m6, [pw_2000] psubw m4, m6 psubw m2, m6 movu [r2], m4 movu [r2 + 16], m2 punpcklbw m4, m3, m5 punpckhbw m3, m5 pmaddubsw m4, m1 pmaddubsw m3, m1 movu m5, [r0 + 2 * r1] punpcklbw m2, m7, m5 punpckhbw m7, m5 pmaddubsw m2, m0 pmaddubsw m7, m0 paddw m4, m2 paddw m3, m7 psubw m4, m6 psubw m3, m6 movu [r2 + r3], m4 movu [r2 + r3 + 16], m3 lea r2, [r2 + 2 * r3] dec r4d jnz .loop RET %endmacro FILTER_V_PS_W16 16, 4 FILTER_V_PS_W16 16, 8 FILTER_V_PS_W16 16, 12 FILTER_V_PS_W16 16, 16 FILTER_V_PS_W16 16, 32 FILTER_V_PS_W16 16, 24 FILTER_V_PS_W16 16, 64 ;-------------------------------------------------------------------------------------------------------------- ;void interp_4tap_vert_ps_24x32(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) ;-------------------------------------------------------------------------------------------------------------- %macro FILTER_V4_PS_W24 2 INIT_XMM sse4 cglobal interp_4tap_vert_ps_24x%2, 4, 6, 8 mov r4d, r4m sub r0, r1 add r3d, r3d %ifdef PIC lea r5, [tab_ChromaCoeff] movd m0, [r5 + r4 * 4] %else movd m0, [tab_ChromaCoeff + r4 * 4] %endif pshufb m1, m0, [tab_Vm] pshufb m0, [tab_Vm + 16] mov r4d, %2/2 .loop: movu m2, [r0] movu m3, [r0 + r1] punpcklbw m4, m2, m3 punpckhbw m2, m3 pmaddubsw m4, m1 pmaddubsw m2, m1 lea r5, [r0 + 2 * r1] movu m5, [r5] movu m7, [r5 + r1] punpcklbw m6, m5, m7 pmaddubsw m6, m0 paddw m4, m6 punpckhbw m6, m5, m7 pmaddubsw m6, m0 paddw m2, m6 mova m6, [pw_2000] psubw m4, m6 psubw m2, m6 movu [r2], m4 movu [r2 + 16], m2 punpcklbw m4, m3, m5 punpckhbw m3, m5 pmaddubsw m4, m1 pmaddubsw m3, m1 movu m2, [r5 + 2 * r1] punpcklbw m5, m7, m2 punpckhbw m7, m2 pmaddubsw m5, m0 pmaddubsw m7, m0 paddw m4, m5 paddw m3, m7 psubw m4, m6 psubw m3, m6 movu [r2 + r3], m4 movu [r2 + r3 + 16], m3 movq m2, [r0 + 16] movq m3, [r0 + r1 + 16] movq m4, [r5 + 16] movq m5, [r5 + r1 + 16] punpcklbw m2, 
m3 punpcklbw m7, m4, m5 pmaddubsw m2, m1 pmaddubsw m7, m0 paddw m2, m7 psubw m2, m6 movu [r2 + 32], m2 movq m2, [r5 + 2 * r1 + 16] punpcklbw m3, m4 punpcklbw m5, m2 pmaddubsw m3, m1 pmaddubsw m5, m0 paddw m3, m5 psubw m3, m6 movu [r2 + r3 + 32], m3 mov r0, r5 lea r2, [r2 + 2 * r3] dec r4d jnz .loop RET %endmacro FILTER_V4_PS_W24 24, 32 FILTER_V4_PS_W24 24, 64 ;--------------------------------------------------------------------------------------------------------------- ; void interp_4tap_vert_ps_32x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) ;--------------------------------------------------------------------------------------------------------------- %macro FILTER_V_PS_W32 2 INIT_XMM sse4 cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8 mov r4d, r4m sub r0, r1 add r3d, r3d %ifdef PIC lea r5, [tab_ChromaCoeff] movd m0, [r5 + r4 * 4] %else movd m0, [tab_ChromaCoeff + r4 * 4] %endif pshufb m1, m0, [tab_Vm] pshufb m0, [tab_Vm + 16] mova m7, [pw_2000] mov r4d, %2 .loop: movu m2, [r0] movu m3, [r0 + r1] punpcklbw m4, m2, m3 punpckhbw m2, m3 pmaddubsw m4, m1 pmaddubsw m2, m1 lea r5, [r0 + 2 * r1] movu m3, [r5] movu m5, [r5 + r1] punpcklbw m6, m3, m5 punpckhbw m3, m5 pmaddubsw m6, m0 pmaddubsw m3, m0 paddw m4, m6 paddw m2, m3 psubw m4, m7 psubw m2, m7 movu [r2], m4 movu [r2 + 16], m2 movu m2, [r0 + 16] movu m3, [r0 + r1 + 16] punpcklbw m4, m2, m3 punpckhbw m2, m3 pmaddubsw m4, m1 pmaddubsw m2, m1 movu m3, [r5 + 16] movu m5, [r5 + r1 + 16] punpcklbw m6, m3, m5 punpckhbw m3, m5 pmaddubsw m6, m0 pmaddubsw m3, m0 paddw m4, m6 paddw m2, m3 psubw m4, m7 psubw m2, m7 movu [r2 + 32], m4 movu [r2 + 48], m2 lea r0, [r0 + r1] lea r2, [r2 + r3] dec r4d jnz .loop RET %endmacro FILTER_V_PS_W32 32, 8 FILTER_V_PS_W32 32, 16 FILTER_V_PS_W32 32, 24 FILTER_V_PS_W32 32, 32 FILTER_V_PS_W32 32, 48 FILTER_V_PS_W32 32, 64 ;----------------------------------------------------------------------------- ; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;----------------------------------------------------------------------------- %macro FILTER_V4_W8_H8_H16_H32 2 INIT_XMM sse4 cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8 mov r4d, r4m sub r0, r1 %ifdef PIC lea r5, [tab_ChromaCoeff] movd m5, [r5 + r4 * 4] %else movd m5, [tab_ChromaCoeff + r4 * 4] %endif pshufb m6, m5, [tab_Vm] pshufb m5, [tab_Vm + 16] mova m4, [pw_512] lea r5, [r1 * 3] mov r4d, %2 .loop: movq m0, [r0] movq m1, [r0 + r1] movq m2, [r0 + 2 * r1] movq m3, [r0 + r5] punpcklbw m0, m1 punpcklbw m1, m2 punpcklbw m2, m3 pmaddubsw m0, m6 pmaddubsw m7, m2, m5 paddw m0, m7 pmulhrsw m0, m4 packuswb m0, m0 movh [r2], m0 lea r0, [r0 + 4 * r1] movq m0, [r0] punpcklbw m3, m0 pmaddubsw m1, m6 pmaddubsw m7, m3, m5 paddw m1, m7 pmulhrsw m1, m4 packuswb m1, m1 movh [r2 + r3], m1 movq m1, [r0 + r1] punpcklbw m0, m1 pmaddubsw m2, m6 pmaddubsw m0, m5 paddw m2, m0 pmulhrsw m2, m4 movq m7, [r0 + 2 * r1] punpcklbw m1, m7 pmaddubsw m3, m6 pmaddubsw m1, m5 paddw m3, m1 pmulhrsw m3, m4 packuswb m2, m3 lea r2, [r2 + 2 * r3] movh [r2], m2 movhps [r2 + r3], m2 lea r2, [r2 + 2 * r3] sub r4, 4 jnz .loop RET %endmacro FILTER_V4_W8_H8_H16_H32 8, 8 FILTER_V4_W8_H8_H16_H32 8, 16 FILTER_V4_W8_H8_H16_H32 8, 32 FILTER_V4_W8_H8_H16_H32 8, 12 FILTER_V4_W8_H8_H16_H32 8, 64 %macro PROCESS_CHROMA_AVX2_W8_8R 0 movq xm1, [r0] ; m1 = row 0 movq xm2, [r0 + r1] ; m2 = row 1 punpcklbw xm1, xm2 ; m1 = [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] movq xm3, [r0 + r1 * 2] ; m3 = row 2 punpcklbw xm2, xm3 ; m2 = [27 17 
26 16 25 15 24 14 23 13 22 12 21 11 20 10] vinserti128 m5, m1, xm2, 1 ; m5 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] - [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] pmaddubsw m5, [r5] movq xm4, [r0 + r4] ; m4 = row 3 punpcklbw xm3, xm4 ; m3 = [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20] lea r0, [r0 + r1 * 4] movq xm1, [r0] ; m1 = row 4 punpcklbw xm4, xm1 ; m4 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] vinserti128 m2, m3, xm4, 1 ; m2 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] - [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20] pmaddubsw m0, m2, [r5 + 1 * mmsize] paddw m5, m0 pmaddubsw m2, [r5] movq xm3, [r0 + r1] ; m3 = row 5 punpcklbw xm1, xm3 ; m1 = [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40] movq xm4, [r0 + r1 * 2] ; m4 = row 6 punpcklbw xm3, xm4 ; m3 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] vinserti128 m1, m1, xm3, 1 ; m1 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] - [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40] pmaddubsw m0, m1, [r5 + 1 * mmsize] paddw m2, m0 pmaddubsw m1, [r5] movq xm3, [r0 + r4] ; m3 = row 7 punpcklbw xm4, xm3 ; m4 = [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60] lea r0, [r0 + r1 * 4] movq xm0, [r0] ; m0 = row 8 punpcklbw xm3, xm0 ; m3 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] vinserti128 m4, m4, xm3, 1 ; m4 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] - [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60] pmaddubsw m3, m4, [r5 + 1 * mmsize] paddw m1, m3 pmaddubsw m4, [r5] movq xm3, [r0 + r1] ; m3 = row 9 punpcklbw xm0, xm3 ; m0 = [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80] movq xm6, [r0 + r1 * 2] ; m6 = row 10 punpcklbw xm3, xm6 ; m3 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90] vinserti128 m0, m0, xm3, 1 ; m0 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90] - [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80] pmaddubsw m0, [r5 + 1 * mmsize] paddw m4, m0 %endmacro %macro FILTER_VER_CHROMA_AVX2_8x8 1 INIT_YMM avx2 cglobal interp_4tap_vert_%1_8x8, 4, 6, 7 mov r4d, r4m shl r4d, 6 %ifdef PIC lea r5, [tab_ChromaCoeffVer_32] add r5, r4 %else lea r5, [tab_ChromaCoeffVer_32 + r4] %endif lea r4, [r1 * 3] sub r0, r1 PROCESS_CHROMA_AVX2_W8_8R %ifidn %1,pp lea r4, [r3 * 3] mova m3, [pw_512] pmulhrsw m5, m3 ; m5 = word: row 0, row 1 pmulhrsw m2, m3 ; m2 = word: row 2, row 3 pmulhrsw m1, m3 ; m1 = word: row 4, row 5 pmulhrsw m4, m3 ; m4 = word: row 6, row 7 packuswb m5, m2 packuswb m1, m4 vextracti128 xm2, m5, 1 vextracti128 xm4, m1, 1 movq [r2], xm5 movq [r2 + r3], xm2 movhps [r2 + r3 * 2], xm5 movhps [r2 + r4], xm2 lea r2, [r2 + r3 * 4] movq [r2], xm1 movq [r2 + r3], xm4 movhps [r2 + r3 * 2], xm1 movhps [r2 + r4], xm4 %else add r3d, r3d vbroadcasti128 m3, [pw_2000] lea r4, [r3 * 3] psubw m5, m3 ; m5 = word: row 0, row 1 psubw m2, m3 ; m2 = word: row 2, row 3 psubw m1, m3 ; m1 = word: row 4, row 5 psubw m4, m3 ; m4 = word: row 6, row 7 vextracti128 xm6, m5, 1 vextracti128 xm3, m2, 1 vextracti128 xm0, m1, 1 movu [r2], xm5 movu [r2 + r3], xm6 movu [r2 + r3 * 2], xm2 movu [r2 + r4], xm3 lea r2, [r2 + r3 * 4] movu [r2], xm1 movu [r2 + r3], xm0 movu [r2 + r3 * 2], xm4 vextracti128 xm4, m4, 1 movu [r2 + r4], xm4 %endif RET %endmacro FILTER_VER_CHROMA_AVX2_8x8 pp FILTER_VER_CHROMA_AVX2_8x8 ps %macro FILTER_VER_CHROMA_AVX2_8x6 1 INIT_YMM avx2 cglobal interp_4tap_vert_%1_8x6, 4, 6, 6 mov r4d, r4m shl r4d, 6 %ifdef PIC lea r5, [tab_ChromaCoeffVer_32] add r5, r4 %else lea r5, [tab_ChromaCoeffVer_32 + r4] %endif lea r4, [r1 * 3] sub r0, r1 movq xm1, [r0] ; m1 = row 0 movq xm2, [r0 + r1] ; m2 = 
row 1 punpcklbw xm1, xm2 ; m1 = [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] movq xm3, [r0 + r1 * 2] ; m3 = row 2 punpcklbw xm2, xm3 ; m2 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] vinserti128 m5, m1, xm2, 1 ; m5 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] - [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] pmaddubsw m5, [r5] movq xm4, [r0 + r4] ; m4 = row 3 punpcklbw xm3, xm4 ; m3 = [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20] lea r0, [r0 + r1 * 4] movq xm1, [r0] ; m1 = row 4 punpcklbw xm4, xm1 ; m4 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] vinserti128 m2, m3, xm4, 1 ; m2 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] - [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20] pmaddubsw m0, m2, [r5 + 1 * mmsize] paddw m5, m0 pmaddubsw m2, [r5] movq xm3, [r0 + r1] ; m3 = row 5 punpcklbw xm1, xm3 ; m1 = [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40] movq xm4, [r0 + r1 * 2] ; m4 = row 6 punpcklbw xm3, xm4 ; m3 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] vinserti128 m1, m1, xm3, 1 ; m1 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] - [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40] pmaddubsw m0, m1, [r5 + 1 * mmsize] paddw m2, m0 pmaddubsw m1, [r5] movq xm3, [r0 + r4] ; m3 = row 7 punpcklbw xm4, xm3 ; m4 = [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60] lea r0, [r0 + r1 * 4] movq xm0, [r0] ; m0 = row 8 punpcklbw xm3, xm0 ; m3 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] vinserti128 m4, m4, xm3, 1 ; m4 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] - [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60] pmaddubsw m4, [r5 + 1 * mmsize] paddw m1, m4 %ifidn %1,pp lea r4, [r3 * 3] mova m3, [pw_512] pmulhrsw m5, m3 ; m5 = word: row 0, row 1 pmulhrsw m2, m3 ; m2 = word: row 2, row 3 pmulhrsw m1, m3 ; m1 = word: row 4, row 5 packuswb m5, m2 packuswb m1, m1 vextracti128 xm2, m5, 1 vextracti128 xm4, m1, 1 movq [r2], xm5 movq [r2 + r3], xm2 movhps [r2 + r3 * 2], xm5 movhps [r2 + r4], xm2 lea r2, [r2 + r3 * 4] movq [r2], xm1 movq [r2 + r3], xm4 %else add r3d, r3d mova m3, [pw_2000] lea r4, [r3 * 3] psubw m5, m3 ; m5 = word: row 0, row 1 psubw m2, m3 ; m2 = word: row 2, row 3 psubw m1, m3 ; m1 = word: row 4, row 5 vextracti128 xm4, m5, 1 vextracti128 xm3, m2, 1 vextracti128 xm0, m1, 1 movu [r2], xm5 movu [r2 + r3], xm4 movu [r2 + r3 * 2], xm2 movu [r2 + r4], xm3 lea r2, [r2 + r3 * 4] movu [r2], xm1 movu [r2 + r3], xm0 %endif RET %endmacro FILTER_VER_CHROMA_AVX2_8x6 pp FILTER_VER_CHROMA_AVX2_8x6 ps %macro PROCESS_CHROMA_AVX2_W8_16R 1 movq xm1, [r0] ; m1 = row 0 movq xm2, [r0 + r1] ; m2 = row 1 punpcklbw xm1, xm2 movq xm3, [r0 + r1 * 2] ; m3 = row 2 punpcklbw xm2, xm3 vinserti128 m5, m1, xm2, 1 pmaddubsw m5, [r5] movq xm4, [r0 + r4] ; m4 = row 3 punpcklbw xm3, xm4 lea r0, [r0 + r1 * 4] movq xm1, [r0] ; m1 = row 4 punpcklbw xm4, xm1 vinserti128 m2, m3, xm4, 1 pmaddubsw m0, m2, [r5 + 1 * mmsize] paddw m5, m0 pmaddubsw m2, [r5] movq xm3, [r0 + r1] ; m3 = row 5 punpcklbw xm1, xm3 movq xm4, [r0 + r1 * 2] ; m4 = row 6 punpcklbw xm3, xm4 vinserti128 m1, m1, xm3, 1 pmaddubsw m0, m1, [r5 + 1 * mmsize] paddw m2, m0 pmaddubsw m1, [r5] movq xm3, [r0 + r4] ; m3 = row 7 punpcklbw xm4, xm3 lea r0, [r0 + r1 * 4] movq xm0, [r0] ; m0 = row 8 punpcklbw xm3, xm0 vinserti128 m4, m4, xm3, 1 pmaddubsw m3, m4, [r5 + 1 * mmsize] paddw m1, m3 pmaddubsw m4, [r5] movq xm3, [r0 + r1] ; m3 = row 9 punpcklbw xm0, xm3 movq xm6, [r0 + r1 * 2] ; m6 = row 10 punpcklbw xm3, xm6 vinserti128 m0, m0, xm3, 1 pmaddubsw m3, m0, [r5 + 1 * mmsize] paddw m4, m3 pmaddubsw m0, [r5] %ifidn 
%1,pp pmulhrsw m5, m7 ; m5 = word: row 0, row 1 pmulhrsw m2, m7 ; m2 = word: row 2, row 3 pmulhrsw m1, m7 ; m1 = word: row 4, row 5 pmulhrsw m4, m7 ; m4 = word: row 6, row 7 packuswb m5, m2 packuswb m1, m4 vextracti128 xm2, m5, 1 vextracti128 xm4, m1, 1 movq [r2], xm5 movq [r2 + r3], xm2 movhps [r2 + r3 * 2], xm5 movhps [r2 + r6], xm2 lea r2, [r2 + r3 * 4] movq [r2], xm1 movq [r2 + r3], xm4 movhps [r2 + r3 * 2], xm1 movhps [r2 + r6], xm4 %else psubw m5, m7 ; m5 = word: row 0, row 1 psubw m2, m7 ; m2 = word: row 2, row 3 psubw m1, m7 ; m1 = word: row 4, row 5 psubw m4, m7 ; m4 = word: row 6, row 7 vextracti128 xm3, m5, 1 movu [r2], xm5 movu [r2 + r3], xm3 vextracti128 xm3, m2, 1 movu [r2 + r3 * 2], xm2 movu [r2 + r6], xm3 lea r2, [r2 + r3 * 4] vextracti128 xm5, m1, 1 vextracti128 xm3, m4, 1 movu [r2], xm1 movu [r2 + r3], xm5 movu [r2 + r3 * 2], xm4 movu [r2 + r6], xm3 %endif movq xm3, [r0 + r4] ; m3 = row 11 punpcklbw xm6, xm3 lea r0, [r0 + r1 * 4] movq xm5, [r0] ; m5 = row 12 punpcklbw xm3, xm5 vinserti128 m6, m6, xm3, 1 pmaddubsw m3, m6, [r5 + 1 * mmsize] paddw m0, m3 pmaddubsw m6, [r5] movq xm3, [r0 + r1] ; m3 = row 13 punpcklbw xm5, xm3 movq xm2, [r0 + r1 * 2] ; m2 = row 14 punpcklbw xm3, xm2 vinserti128 m5, m5, xm3, 1 pmaddubsw m3, m5, [r5 + 1 * mmsize] paddw m6, m3 pmaddubsw m5, [r5] movq xm3, [r0 + r4] ; m3 = row 15 punpcklbw xm2, xm3 lea r0, [r0 + r1 * 4] movq xm1, [r0] ; m1 = row 16 punpcklbw xm3, xm1 vinserti128 m2, m2, xm3, 1 pmaddubsw m3, m2, [r5 + 1 * mmsize] paddw m5, m3 pmaddubsw m2, [r5] movq xm3, [r0 + r1] ; m3 = row 17 punpcklbw xm1, xm3 movq xm4, [r0 + r1 * 2] ; m4 = row 18 punpcklbw xm3, xm4 vinserti128 m1, m1, xm3, 1 pmaddubsw m1, [r5 + 1 * mmsize] paddw m2, m1 lea r2, [r2 + r3 * 4] %ifidn %1,pp pmulhrsw m0, m7 ; m0 = word: row 8, row 9 pmulhrsw m6, m7 ; m6 = word: row 10, row 11 pmulhrsw m5, m7 ; m5 = word: row 12, row 13 pmulhrsw m2, m7 ; m2 = word: row 14, row 15 packuswb m0, m6 packuswb m5, m2 vextracti128 xm6, m0, 1 vextracti128 xm2, m5, 1 movq [r2], xm0 movq [r2 + r3], xm6 movhps [r2 + r3 * 2], xm0 movhps [r2 + r6], xm6 lea r2, [r2 + r3 * 4] movq [r2], xm5 movq [r2 + r3], xm2 movhps [r2 + r3 * 2], xm5 movhps [r2 + r6], xm2 %else psubw m0, m7 ; m0 = word: row 8, row 9 psubw m6, m7 ; m6 = word: row 10, row 11 psubw m5, m7 ; m5 = word: row 12, row 13 psubw m2, m7 ; m2 = word: row 14, row 15 vextracti128 xm1, m0, 1 vextracti128 xm3, m6, 1 movu [r2], xm0 movu [r2 + r3], xm1 movu [r2 + r3 * 2], xm6 movu [r2 + r6], xm3 lea r2, [r2 + r3 * 4] vextracti128 xm1, m5, 1 vextracti128 xm3, m2, 1 movu [r2], xm5 movu [r2 + r3], xm1 movu [r2 + r3 * 2], xm2 movu [r2 + r6], xm3 %endif %endmacro %macro FILTER_VER_CHROMA_AVX2_8x16 1 INIT_YMM avx2 cglobal interp_4tap_vert_%1_8x16, 4, 7, 8 mov r4d, r4m shl r4d, 6 %ifdef PIC lea r5, [tab_ChromaCoeffVer_32] add r5, r4 %else lea r5, [tab_ChromaCoeffVer_32 + r4] %endif lea r4, [r1 * 3] sub r0, r1 %ifidn %1,pp mova m7, [pw_512] %else add r3d, r3d mova m7, [pw_2000] %endif lea r6, [r3 * 3] PROCESS_CHROMA_AVX2_W8_16R %1 RET %endmacro FILTER_VER_CHROMA_AVX2_8x16 pp FILTER_VER_CHROMA_AVX2_8x16 ps %macro FILTER_VER_CHROMA_AVX2_8x12 1 INIT_YMM avx2 cglobal interp_4tap_vert_%1_8x12, 4, 7, 8 mov r4d, r4m shl r4d, 6 %ifdef PIC lea r5, [tab_ChromaCoeffVer_32] add r5, r4 %else lea r5, [tab_ChromaCoeffVer_32 + r4] %endif lea r4, [r1 * 3] sub r0, r1 %ifidn %1, pp mova m7, [pw_512] %else add r3d, r3d mova m7, [pw_2000] %endif lea r6, [r3 * 3] movq xm1, [r0] ; m1 = row 0 movq xm2, [r0 + r1] ; m2 = row 1 punpcklbw xm1, xm2 movq xm3, [r0 + r1 * 2] ; m3 = row 
2 punpcklbw xm2, xm3 vinserti128 m5, m1, xm2, 1 pmaddubsw m5, [r5] movq xm4, [r0 + r4] ; m4 = row 3 punpcklbw xm3, xm4 lea r0, [r0 + r1 * 4] movq xm1, [r0] ; m1 = row 4 punpcklbw xm4, xm1 vinserti128 m2, m3, xm4, 1 pmaddubsw m0, m2, [r5 + 1 * mmsize] paddw m5, m0 pmaddubsw m2, [r5] movq xm3, [r0 + r1] ; m3 = row 5 punpcklbw xm1, xm3 movq xm4, [r0 + r1 * 2] ; m4 = row 6 punpcklbw xm3, xm4 vinserti128 m1, m1, xm3, 1 pmaddubsw m0, m1, [r5 + 1 * mmsize] paddw m2, m0 pmaddubsw m1, [r5] movq xm3, [r0 + r4] ; m3 = row 7 punpcklbw xm4, xm3 lea r0, [r0 + r1 * 4] movq xm0, [r0] ; m0 = row 8 punpcklbw xm3, xm0 vinserti128 m4, m4, xm3, 1 pmaddubsw m3, m4, [r5 + 1 * mmsize] paddw m1, m3 pmaddubsw m4, [r5] movq xm3, [r0 + r1] ; m3 = row 9 punpcklbw xm0, xm3 movq xm6, [r0 + r1 * 2] ; m6 = row 10 punpcklbw xm3, xm6 vinserti128 m0, m0, xm3, 1 pmaddubsw m3, m0, [r5 + 1 * mmsize] paddw m4, m3 pmaddubsw m0, [r5] %ifidn %1, pp pmulhrsw m5, m7 ; m5 = word: row 0, row 1 pmulhrsw m2, m7 ; m2 = word: row 2, row 3 pmulhrsw m1, m7 ; m1 = word: row 4, row 5 pmulhrsw m4, m7 ; m4 = word: row 6, row 7 packuswb m5, m2 packuswb m1, m4 vextracti128 xm2, m5, 1 vextracti128 xm4, m1, 1 movq [r2], xm5 movq [r2 + r3], xm2 movhps [r2 + r3 * 2], xm5 movhps [r2 + r6], xm2 lea r2, [r2 + r3 * 4] movq [r2], xm1 movq [r2 + r3], xm4 movhps [r2 + r3 * 2], xm1 movhps [r2 + r6], xm4 %else psubw m5, m7 ; m5 = word: row 0, row 1 psubw m2, m7 ; m2 = word: row 2, row 3 psubw m1, m7 ; m1 = word: row 4, row 5 psubw m4, m7 ; m4 = word: row 6, row 7 vextracti128 xm3, m5, 1 movu [r2], xm5 movu [r2 + r3], xm3 vextracti128 xm3, m2, 1 movu [r2 + r3 * 2], xm2 movu [r2 + r6], xm3 lea r2, [r2 + r3 * 4] vextracti128 xm5, m1, 1 vextracti128 xm3, m4, 1 movu [r2], xm1 movu [r2 + r3], xm5 movu [r2 + r3 * 2], xm4 movu [r2 + r6], xm3 %endif movq xm3, [r0 + r4] ; m3 = row 11 punpcklbw xm6, xm3 lea r0, [r0 + r1 * 4] movq xm5, [r0] ; m5 = row 12 punpcklbw xm3, xm5 vinserti128 m6, m6, xm3, 1 pmaddubsw m3, m6, [r5 + 1 * mmsize] paddw m0, m3 pmaddubsw m6, [r5] movq xm3, [r0 + r1] ; m3 = row 13 punpcklbw xm5, xm3 movq xm2, [r0 + r1 * 2] ; m2 = row 14 punpcklbw xm3, xm2 vinserti128 m5, m5, xm3, 1 pmaddubsw m3, m5, [r5 + 1 * mmsize] paddw m6, m3 lea r2, [r2 + r3 * 4] %ifidn %1, pp pmulhrsw m0, m7 ; m0 = word: row 8, row 9 pmulhrsw m6, m7 ; m6 = word: row 10, row 11 packuswb m0, m6 vextracti128 xm6, m0, 1 movq [r2], xm0 movq [r2 + r3], xm6 movhps [r2 + r3 * 2], xm0 movhps [r2 + r6], xm6 %else psubw m0, m7 ; m0 = word: row 8, row 9 psubw m6, m7 ; m6 = word: row 10, row 11 vextracti128 xm1, m0, 1 vextracti128 xm3, m6, 1 movu [r2], xm0 movu [r2 + r3], xm1 movu [r2 + r3 * 2], xm6 movu [r2 + r6], xm3 %endif RET %endmacro FILTER_VER_CHROMA_AVX2_8x12 pp FILTER_VER_CHROMA_AVX2_8x12 ps %macro FILTER_VER_CHROMA_AVX2_8xN 2 INIT_YMM avx2 cglobal interp_4tap_vert_%1_8x%2, 4, 7, 8 mov r4d, r4m shl r4d, 6 %ifdef PIC lea r5, [tab_ChromaCoeffVer_32] add r5, r4 %else lea r5, [tab_ChromaCoeffVer_32 + r4] %endif lea r4, [r1 * 3] sub r0, r1 %ifidn %1,pp mova m7, [pw_512] %else add r3d, r3d mova m7, [pw_2000] %endif lea r6, [r3 * 3] %rep %2 / 16 PROCESS_CHROMA_AVX2_W8_16R %1 lea r2, [r2 + r3 * 4] %endrep RET %endmacro FILTER_VER_CHROMA_AVX2_8xN pp, 32 FILTER_VER_CHROMA_AVX2_8xN ps, 32 FILTER_VER_CHROMA_AVX2_8xN pp, 64 FILTER_VER_CHROMA_AVX2_8xN ps, 64 %macro PROCESS_CHROMA_AVX2_W8_4R 0 movq xm1, [r0] ; m1 = row 0 movq xm2, [r0 + r1] ; m2 = row 1 punpcklbw xm1, xm2 ; m1 = [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] movq xm3, [r0 + r1 * 2] ; m3 = row 2 punpcklbw xm2, xm3 ; m2 = [27 17 26 
16 25 15 24 14 23 13 22 12 21 11 20 10] vinserti128 m0, m1, xm2, 1 ; m0 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] - [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] pmaddubsw m0, [r5] movq xm4, [r0 + r4] ; m4 = row 3 punpcklbw xm3, xm4 ; m3 = [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20] lea r0, [r0 + r1 * 4] movq xm1, [r0] ; m1 = row 4 punpcklbw xm4, xm1 ; m4 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] vinserti128 m2, m3, xm4, 1 ; m2 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] - [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20] pmaddubsw m4, m2, [r5 + 1 * mmsize] paddw m0, m4 pmaddubsw m2, [r5] movq xm3, [r0 + r1] ; m3 = row 5 punpcklbw xm1, xm3 ; m1 = [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40] movq xm4, [r0 + r1 * 2] ; m4 = row 6 punpcklbw xm3, xm4 ; m3 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] vinserti128 m1, m1, xm3, 1 ; m1 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] - [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40] pmaddubsw m1, [r5 + 1 * mmsize] paddw m2, m1 %endmacro %macro FILTER_VER_CHROMA_AVX2_8x4 1 INIT_YMM avx2 cglobal interp_4tap_vert_%1_8x4, 4, 6, 5 mov r4d, r4m shl r4d, 6 %ifdef PIC lea r5, [tab_ChromaCoeffVer_32] add r5, r4 %else lea r5, [tab_ChromaCoeffVer_32 + r4] %endif lea r4, [r1 * 3] sub r0, r1 PROCESS_CHROMA_AVX2_W8_4R %ifidn %1,pp lea r4, [r3 * 3] mova m3, [pw_512] pmulhrsw m0, m3 ; m0 = word: row 0, row 1 pmulhrsw m2, m3 ; m2 = word: row 2, row 3 packuswb m0, m2 vextracti128 xm2, m0, 1 movq [r2], xm0 movq [r2 + r3], xm2 movhps [r2 + r3 * 2], xm0 movhps [r2 + r4], xm2 %else add r3d, r3d vbroadcasti128 m3, [pw_2000] lea r4, [r3 * 3] psubw m0, m3 ; m0 = word: row 0, row 1 psubw m2, m3 ; m2 = word: row 2, row 3 vextracti128 xm1, m0, 1 vextracti128 xm4, m2, 1 movu [r2], xm0 movu [r2 + r3], xm1 movu [r2 + r3 * 2], xm2 movu [r2 + r4], xm4 %endif RET %endmacro FILTER_VER_CHROMA_AVX2_8x4 pp FILTER_VER_CHROMA_AVX2_8x4 ps %macro FILTER_VER_CHROMA_AVX2_8x2 1 INIT_YMM avx2 cglobal interp_4tap_vert_%1_8x2, 4, 6, 4 mov r4d, r4m shl r4d, 6 %ifdef PIC lea r5, [tab_ChromaCoeffVer_32] add r5, r4 %else lea r5, [tab_ChromaCoeffVer_32 + r4] %endif lea r4, [r1 * 3] sub r0, r1 movq xm1, [r0] ; m1 = row 0 movq xm2, [r0 + r1] ; m2 = row 1 punpcklbw xm1, xm2 ; m1 = [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] movq xm3, [r0 + r1 * 2] ; m3 = row 2 punpcklbw xm2, xm3 ; m2 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] vinserti128 m1, m1, xm2, 1 ; m1 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] - [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] pmaddubsw m1, [r5] movq xm2, [r0 + r4] ; m2 = row 3 punpcklbw xm3, xm2 ; m3 = [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20] movq xm0, [r0 + r1 * 4] ; m0 = row 4 punpcklbw xm2, xm0 ; m2 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] vinserti128 m3, m3, xm2, 1 ; m3 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] - [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20] pmaddubsw m3, [r5 + 1 * mmsize] paddw m1, m3 %ifidn %1,pp pmulhrsw m1, [pw_512] ; m1 = word: row 0, row 1 packuswb m1, m1 vextracti128 xm0, m1, 1 movq [r2], xm1 movq [r2 + r3], xm0 %else add r3d, r3d psubw m1, [pw_2000] ; m1 = word: row 0, row 1 vextracti128 xm0, m1, 1 movu [r2], xm1 movu [r2 + r3], xm0 %endif RET %endmacro FILTER_VER_CHROMA_AVX2_8x2 pp FILTER_VER_CHROMA_AVX2_8x2 ps %macro FILTER_VER_CHROMA_AVX2_6x8 1 INIT_YMM avx2 cglobal interp_4tap_vert_%1_6x8, 4, 6, 7 mov r4d, r4m shl r4d, 6 %ifdef PIC lea r5, [tab_ChromaCoeffVer_32] add r5, r4 %else lea r5, [tab_ChromaCoeffVer_32 + r4] %endif lea r4, [r1 * 
3] sub r0, r1 PROCESS_CHROMA_AVX2_W8_8R %ifidn %1,pp lea r4, [r3 * 3] mova m3, [pw_512] pmulhrsw m5, m3 ; m5 = word: row 0, row 1 pmulhrsw m2, m3 ; m2 = word: row 2, row 3 pmulhrsw m1, m3 ; m1 = word: row 4, row 5 pmulhrsw m4, m3 ; m4 = word: row 6, row 7 packuswb m5, m2 packuswb m1, m4 vextracti128 xm2, m5, 1 vextracti128 xm4, m1, 1 movd [r2], xm5 pextrw [r2 + 4], xm5, 2 movd [r2 + r3], xm2 pextrw [r2 + r3 + 4], xm2, 2 pextrd [r2 + r3 * 2], xm5, 2 pextrw [r2 + r3 * 2 + 4], xm5, 6 pextrd [r2 + r4], xm2, 2 pextrw [r2 + r4 + 4], xm2, 6 lea r2, [r2 + r3 * 4] movd [r2], xm1 pextrw [r2 + 4], xm1, 2 movd [r2 + r3], xm4 pextrw [r2 + r3 + 4], xm4, 2 pextrd [r2 + r3 * 2], xm1, 2 pextrw [r2 + r3 * 2 + 4], xm1, 6 pextrd [r2 + r4], xm4, 2 pextrw [r2 + r4 + 4], xm4, 6 %else add r3d, r3d vbroadcasti128 m3, [pw_2000] lea r4, [r3 * 3] psubw m5, m3 ; m5 = word: row 0, row 1 psubw m2, m3 ; m2 = word: row 2, row 3 psubw m1, m3 ; m1 = word: row 4, row 5 psubw m4, m3 ; m4 = word: row 6, row 7 vextracti128 xm6, m5, 1 vextracti128 xm3, m2, 1 vextracti128 xm0, m1, 1 movq [r2], xm5 pextrd [r2 + 8], xm5, 2 movq [r2 + r3], xm6 pextrd [r2 + r3 + 8], xm6, 2 movq [r2 + r3 * 2], xm2 pextrd [r2 + r3 * 2 + 8], xm2, 2 movq [r2 + r4], xm3 pextrd [r2 + r4 + 8], xm3, 2 lea r2, [r2 + r3 * 4] movq [r2], xm1 pextrd [r2 + 8], xm1, 2 movq [r2 + r3], xm0 pextrd [r2 + r3 + 8], xm0, 2 movq [r2 + r3 * 2], xm4 pextrd [r2 + r3 * 2 + 8], xm4, 2 vextracti128 xm4, m4, 1 movq [r2 + r4], xm4 pextrd [r2 + r4 + 8], xm4, 2 %endif RET %endmacro FILTER_VER_CHROMA_AVX2_6x8 pp FILTER_VER_CHROMA_AVX2_6x8 ps ;----------------------------------------------------------------------------- ;void interp_4tap_vert_pp_6x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;----------------------------------------------------------------------------- %macro FILTER_V4_W6_H4 2 INIT_XMM sse4 cglobal interp_4tap_vert_pp_6x%2, 4, 6, 8 mov r4d, r4m sub r0, r1 %ifdef PIC lea r5, [tab_ChromaCoeff] movd m5, [r5 + r4 * 4] %else movd m5, [tab_ChromaCoeff + r4 * 4] %endif pshufb m6, m5, [tab_Vm] pshufb m5, [tab_Vm + 16] mova m4, [pw_512] mov r4d, %2 lea r5, [3 * r1] .loop: movq m0, [r0] movq m1, [r0 + r1] movq m2, [r0 + 2 * r1] movq m3, [r0 + r5] punpcklbw m0, m1 punpcklbw m1, m2 punpcklbw m2, m3 pmaddubsw m0, m6 pmaddubsw m7, m2, m5 paddw m0, m7 pmulhrsw m0, m4 packuswb m0, m0 movd [r2], m0 pextrw [r2 + 4], m0, 2 lea r0, [r0 + 4 * r1] movq m0, [r0] punpcklbw m3, m0 pmaddubsw m1, m6 pmaddubsw m7, m3, m5 paddw m1, m7 pmulhrsw m1, m4 packuswb m1, m1 movd [r2 + r3], m1 pextrw [r2 + r3 + 4], m1, 2 movq m1, [r0 + r1] punpcklbw m7, m0, m1 pmaddubsw m2, m6 pmaddubsw m7, m5 paddw m2, m7 pmulhrsw m2, m4 packuswb m2, m2 lea r2, [r2 + 2 * r3] movd [r2], m2 pextrw [r2 + 4], m2, 2 movq m2, [r0 + 2 * r1] punpcklbw m1, m2 pmaddubsw m3, m6 pmaddubsw m1, m5 paddw m3, m1 pmulhrsw m3, m4 packuswb m3, m3 movd [r2 + r3], m3 pextrw [r2 + r3 + 4], m3, 2 lea r2, [r2 + 2 * r3] sub r4, 4 jnz .loop RET %endmacro FILTER_V4_W6_H4 6, 8 FILTER_V4_W6_H4 6, 16 ;----------------------------------------------------------------------------- ; void interp_4tap_vert_pp_12x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;----------------------------------------------------------------------------- %macro FILTER_V4_W12_H2 2 INIT_XMM sse4 cglobal interp_4tap_vert_pp_12x%2, 4, 6, 8 mov r4d, r4m sub r0, r1 %ifdef PIC lea r5, [tab_ChromaCoeff] movd m0, [r5 + r4 * 4] %else movd m0, [tab_ChromaCoeff + r4 * 4] %endif pshufb m1, m0, [tab_Vm] pshufb m0, 
[tab_Vm + 16] mov r4d, %2 .loop: movu m2, [r0] movu m3, [r0 + r1] punpcklbw m4, m2, m3 punpckhbw m2, m3 pmaddubsw m4, m1 pmaddubsw m2, m1 lea r0, [r0 + 2 * r1] movu m5, [r0] movu m7, [r0 + r1] punpcklbw m6, m5, m7 pmaddubsw m6, m0 paddw m4, m6 punpckhbw m6, m5, m7 pmaddubsw m6, m0 paddw m2, m6 mova m6, [pw_512] pmulhrsw m4, m6 pmulhrsw m2, m6 packuswb m4, m2 movh [r2], m4 pextrd [r2 + 8], m4, 2 punpcklbw m4, m3, m5 punpckhbw m3, m5 pmaddubsw m4, m1 pmaddubsw m3, m1 movu m5, [r0 + 2 * r1] punpcklbw m2, m7, m5 punpckhbw m7, m5 pmaddubsw m2, m0 pmaddubsw m7, m0 paddw m4, m2 paddw m3, m7 pmulhrsw m4, m6 pmulhrsw m3, m6 packuswb m4, m3 movh [r2 + r3], m4 pextrd [r2 + r3 + 8], m4, 2 lea r2, [r2 + 2 * r3] sub r4, 2 jnz .loop RET %endmacro FILTER_V4_W12_H2 12, 16 FILTER_V4_W12_H2 12, 32 ;----------------------------------------------------------------------------- ; void interp_4tap_vert_pp_16x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;----------------------------------------------------------------------------- %macro FILTER_V4_W16_H2 2 INIT_XMM sse4 cglobal interp_4tap_vert_pp_16x%2, 4, 6, 8 mov r4d, r4m sub r0, r1 %ifdef PIC lea r5, [tab_ChromaCoeff] movd m0, [r5 + r4 * 4] %else movd m0, [tab_ChromaCoeff + r4 * 4] %endif pshufb m1, m0, [tab_Vm] pshufb m0, [tab_Vm + 16] mov r4d, %2/2 .loop: movu m2, [r0] movu m3, [r0 + r1] punpcklbw m4, m2, m3 punpckhbw m2, m3 pmaddubsw m4, m1 pmaddubsw m2, m1 lea r0, [r0 + 2 * r1] movu m5, [r0] movu m6, [r0 + r1] punpckhbw m7, m5, m6 pmaddubsw m7, m0 paddw m2, m7 punpcklbw m7, m5, m6 pmaddubsw m7, m0 paddw m4, m7 mova m7, [pw_512] pmulhrsw m4, m7 pmulhrsw m2, m7 packuswb m4, m2 movu [r2], m4 punpcklbw m4, m3, m5 punpckhbw m3, m5 pmaddubsw m4, m1 pmaddubsw m3, m1 movu m5, [r0 + 2 * r1] punpcklbw m2, m6, m5 punpckhbw m6, m5 pmaddubsw m2, m0 pmaddubsw m6, m0 paddw m4, m2 paddw m3, m6 pmulhrsw m4, m7 pmulhrsw m3, m7 packuswb m4, m3 movu [r2 + r3], m4 lea r2, [r2 + 2 * r3] dec r4d jnz .loop RET %endmacro FILTER_V4_W16_H2 16, 4 FILTER_V4_W16_H2 16, 8 FILTER_V4_W16_H2 16, 12 FILTER_V4_W16_H2 16, 16 FILTER_V4_W16_H2 16, 32 FILTER_V4_W16_H2 16, 24 FILTER_V4_W16_H2 16, 64 %macro FILTER_VER_CHROMA_AVX2_16x16 1 INIT_YMM avx2 %if ARCH_X86_64 == 1 cglobal interp_4tap_vert_%1_16x16, 4, 6, 15 mov r4d, r4m shl r4d, 6 %ifdef PIC lea r5, [tab_ChromaCoeffVer_32] add r5, r4 %else lea r5, [tab_ChromaCoeffVer_32 + r4] %endif mova m12, [r5] mova m13, [r5 + mmsize] lea r4, [r1 * 3] sub r0, r1 %ifidn %1,pp mova m14, [pw_512] %else add r3d, r3d vbroadcasti128 m14, [pw_2000] %endif lea r5, [r3 * 3] movu xm0, [r0] ; m0 = row 0 movu xm1, [r0 + r1] ; m1 = row 1 punpckhbw xm2, xm0, xm1 punpcklbw xm0, xm1 vinserti128 m0, m0, xm2, 1 pmaddubsw m0, m12 movu xm2, [r0 + r1 * 2] ; m2 = row 2 punpckhbw xm3, xm1, xm2 punpcklbw xm1, xm2 vinserti128 m1, m1, xm3, 1 pmaddubsw m1, m12 movu xm3, [r0 + r4] ; m3 = row 3 punpckhbw xm4, xm2, xm3 punpcklbw xm2, xm3 vinserti128 m2, m2, xm4, 1 pmaddubsw m4, m2, m13 paddw m0, m4 pmaddubsw m2, m12 lea r0, [r0 + r1 * 4] movu xm4, [r0] ; m4 = row 4 punpckhbw xm5, xm3, xm4 punpcklbw xm3, xm4 vinserti128 m3, m3, xm5, 1 pmaddubsw m5, m3, m13 paddw m1, m5 pmaddubsw m3, m12 movu xm5, [r0 + r1] ; m5 = row 5 punpckhbw xm6, xm4, xm5 punpcklbw xm4, xm5 vinserti128 m4, m4, xm6, 1 pmaddubsw m6, m4, m13 paddw m2, m6 pmaddubsw m4, m12 movu xm6, [r0 + r1 * 2] ; m6 = row 6 punpckhbw xm7, xm5, xm6 punpcklbw xm5, xm6 vinserti128 m5, m5, xm7, 1 pmaddubsw m7, m5, m13 paddw m3, m7 pmaddubsw m5, m12 movu xm7, [r0 + r4] ; m7 = row 7 punpckhbw xm8, 
xm6, xm7 punpcklbw xm6, xm7 vinserti128 m6, m6, xm8, 1 pmaddubsw m8, m6, m13 paddw m4, m8 pmaddubsw m6, m12 lea r0, [r0 + r1 * 4] movu xm8, [r0] ; m8 = row 8 punpckhbw xm9, xm7, xm8 punpcklbw xm7, xm8 vinserti128 m7, m7, xm9, 1 pmaddubsw m9, m7, m13 paddw m5, m9 pmaddubsw m7, m12 movu xm9, [r0 + r1] ; m9 = row 9 punpckhbw xm10, xm8, xm9 punpcklbw xm8, xm9 vinserti128 m8, m8, xm10, 1 pmaddubsw m10, m8, m13 paddw m6, m10 pmaddubsw m8, m12 movu xm10, [r0 + r1 * 2] ; m10 = row 10 punpckhbw xm11, xm9, xm10 punpcklbw xm9, xm10 vinserti128 m9, m9, xm11, 1 pmaddubsw m11, m9, m13 paddw m7, m11 pmaddubsw m9, m12 %ifidn %1,pp pmulhrsw m0, m14 ; m0 = word: row 0 pmulhrsw m1, m14 ; m1 = word: row 1 pmulhrsw m2, m14 ; m2 = word: row 2 pmulhrsw m3, m14 ; m3 = word: row 3 pmulhrsw m4, m14 ; m4 = word: row 4 pmulhrsw m5, m14 ; m5 = word: row 5 pmulhrsw m6, m14 ; m6 = word: row 6 pmulhrsw m7, m14 ; m7 = word: row 7 packuswb m0, m1 packuswb m2, m3 packuswb m4, m5 packuswb m6, m7 vpermq m0, m0, 11011000b vpermq m2, m2, 11011000b vpermq m4, m4, 11011000b vpermq m6, m6, 11011000b vextracti128 xm1, m0, 1 vextracti128 xm3, m2, 1 vextracti128 xm5, m4, 1 vextracti128 xm7, m6, 1 movu [r2], xm0 movu [r2 + r3], xm1 movu [r2 + r3 * 2], xm2 movu [r2 + r5], xm3 lea r2, [r2 + r3 * 4] movu [r2], xm4 movu [r2 + r3], xm5 movu [r2 + r3 * 2], xm6 movu [r2 + r5], xm7 %else psubw m0, m14 ; m0 = word: row 0 psubw m1, m14 ; m1 = word: row 1 psubw m2, m14 ; m2 = word: row 2 psubw m3, m14 ; m3 = word: row 3 psubw m4, m14 ; m4 = word: row 4 psubw m5, m14 ; m5 = word: row 5 psubw m6, m14 ; m6 = word: row 6 psubw m7, m14 ; m7 = word: row 7 movu [r2], m0 movu [r2 + r3], m1 movu [r2 + r3 * 2], m2 movu [r2 + r5], m3 lea r2, [r2 + r3 * 4] movu [r2], m4 movu [r2 + r3], m5 movu [r2 + r3 * 2], m6 movu [r2 + r5], m7 %endif lea r2, [r2 + r3 * 4] movu xm11, [r0 + r4] ; m11 = row 11 punpckhbw xm6, xm10, xm11 punpcklbw xm10, xm11 vinserti128 m10, m10, xm6, 1 pmaddubsw m6, m10, m13 paddw m8, m6 pmaddubsw m10, m12 lea r0, [r0 + r1 * 4] movu xm6, [r0] ; m6 = row 12 punpckhbw xm7, xm11, xm6 punpcklbw xm11, xm6 vinserti128 m11, m11, xm7, 1 pmaddubsw m7, m11, m13 paddw m9, m7 pmaddubsw m11, m12 movu xm7, [r0 + r1] ; m7 = row 13 punpckhbw xm0, xm6, xm7 punpcklbw xm6, xm7 vinserti128 m6, m6, xm0, 1 pmaddubsw m0, m6, m13 paddw m10, m0 pmaddubsw m6, m12 movu xm0, [r0 + r1 * 2] ; m0 = row 14 punpckhbw xm1, xm7, xm0 punpcklbw xm7, xm0 vinserti128 m7, m7, xm1, 1 pmaddubsw m1, m7, m13 paddw m11, m1 pmaddubsw m7, m12 movu xm1, [r0 + r4] ; m1 = row 15 punpckhbw xm2, xm0, xm1 punpcklbw xm0, xm1 vinserti128 m0, m0, xm2, 1 pmaddubsw m2, m0, m13 paddw m6, m2 pmaddubsw m0, m12 lea r0, [r0 + r1 * 4] movu xm2, [r0] ; m2 = row 16 punpckhbw xm3, xm1, xm2 punpcklbw xm1, xm2 vinserti128 m1, m1, xm3, 1 pmaddubsw m3, m1, m13 paddw m7, m3 pmaddubsw m1, m12 movu xm3, [r0 + r1] ; m3 = row 17 punpckhbw xm4, xm2, xm3 punpcklbw xm2, xm3 vinserti128 m2, m2, xm4, 1 pmaddubsw m2, m13 paddw m0, m2 movu xm4, [r0 + r1 * 2] ; m4 = row 18 punpckhbw xm5, xm3, xm4 punpcklbw xm3, xm4 vinserti128 m3, m3, xm5, 1 pmaddubsw m3, m13 paddw m1, m3 %ifidn %1,pp pmulhrsw m8, m14 ; m8 = word: row 8 pmulhrsw m9, m14 ; m9 = word: row 9 pmulhrsw m10, m14 ; m10 = word: row 10 pmulhrsw m11, m14 ; m11 = word: row 11 pmulhrsw m6, m14 ; m6 = word: row 12 pmulhrsw m7, m14 ; m7 = word: row 13 pmulhrsw m0, m14 ; m0 = word: row 14 pmulhrsw m1, m14 ; m1 = word: row 15 packuswb m8, m9 packuswb m10, m11 packuswb m6, m7 packuswb m0, m1 vpermq m8, m8, 11011000b vpermq m10, m10, 11011000b vpermq m6, m6, 11011000b vpermq 
m0, m0, 11011000b vextracti128 xm9, m8, 1 vextracti128 xm11, m10, 1 vextracti128 xm7, m6, 1 vextracti128 xm1, m0, 1 movu [r2], xm8 movu [r2 + r3], xm9 movu [r2 + r3 * 2], xm10 movu [r2 + r5], xm11 lea r2, [r2 + r3 * 4] movu [r2], xm6 movu [r2 + r3], xm7 movu [r2 + r3 * 2], xm0 movu [r2 + r5], xm1 %else psubw m8, m14 ; m8 = word: row 8 psubw m9, m14 ; m9 = word: row 9 psubw m10, m14 ; m10 = word: row 10 psubw m11, m14 ; m11 = word: row 11 psubw m6, m14 ; m6 = word: row 12 psubw m7, m14 ; m7 = word: row 13 psubw m0, m14 ; m0 = word: row 14 psubw m1, m14 ; m1 = word: row 15 movu [r2], m8 movu [r2 + r3], m9 movu [r2 + r3 * 2], m10 movu [r2 + r5], m11 lea r2, [r2 + r3 * 4] movu [r2], m6 movu [r2 + r3], m7 movu [r2 + r3 * 2], m0 movu [r2 + r5], m1 %endif RET %endif %endmacro FILTER_VER_CHROMA_AVX2_16x16 pp FILTER_VER_CHROMA_AVX2_16x16 ps %macro FILTER_VER_CHROMA_AVX2_16x8 1 INIT_YMM avx2 cglobal interp_4tap_vert_%1_16x8, 4, 7, 7 mov r4d, r4m shl r4d, 6 %ifdef PIC lea r5, [tab_ChromaCoeffVer_32] add r5, r4 %else lea r5, [tab_ChromaCoeffVer_32 + r4] %endif lea r4, [r1 * 3] sub r0, r1 %ifidn %1,pp mova m6, [pw_512] %else add r3d, r3d mova m6, [pw_2000] %endif lea r6, [r3 * 3] movu xm0, [r0] ; m0 = row 0 movu xm1, [r0 + r1] ; m1 = row 1 punpckhbw xm2, xm0, xm1 punpcklbw xm0, xm1 vinserti128 m0, m0, xm2, 1 pmaddubsw m0, [r5] movu xm2, [r0 + r1 * 2] ; m2 = row 2 punpckhbw xm3, xm1, xm2 punpcklbw xm1, xm2 vinserti128 m1, m1, xm3, 1 pmaddubsw m1, [r5] movu xm3, [r0 + r4] ; m3 = row 3 punpckhbw xm4, xm2, xm3 punpcklbw xm2, xm3 vinserti128 m2, m2, xm4, 1 pmaddubsw m4, m2, [r5 + mmsize] paddw m0, m4 pmaddubsw m2, [r5] lea r0, [r0 + r1 * 4] movu xm4, [r0] ; m4 = row 4 punpckhbw xm5, xm3, xm4 punpcklbw xm3, xm4 vinserti128 m3, m3, xm5, 1 pmaddubsw m5, m3, [r5 + mmsize] paddw m1, m5 pmaddubsw m3, [r5] %ifidn %1,pp pmulhrsw m0, m6 ; m0 = word: row 0 pmulhrsw m1, m6 ; m1 = word: row 1 packuswb m0, m1 vpermq m0, m0, 11011000b vextracti128 xm1, m0, 1 movu [r2], xm0 movu [r2 + r3], xm1 %else psubw m0, m6 ; m0 = word: row 0 psubw m1, m6 ; m1 = word: row 1 movu [r2], m0 movu [r2 + r3], m1 %endif movu xm0, [r0 + r1] ; m0 = row 5 punpckhbw xm1, xm4, xm0 punpcklbw xm4, xm0 vinserti128 m4, m4, xm1, 1 pmaddubsw m1, m4, [r5 + mmsize] paddw m2, m1 pmaddubsw m4, [r5] movu xm1, [r0 + r1 * 2] ; m1 = row 6 punpckhbw xm5, xm0, xm1 punpcklbw xm0, xm1 vinserti128 m0, m0, xm5, 1 pmaddubsw m5, m0, [r5 + mmsize] paddw m3, m5 pmaddubsw m0, [r5] %ifidn %1,pp pmulhrsw m2, m6 ; m2 = word: row 2 pmulhrsw m3, m6 ; m3 = word: row 3 packuswb m2, m3 vpermq m2, m2, 11011000b vextracti128 xm3, m2, 1 movu [r2 + r3 * 2], xm2 movu [r2 + r6], xm3 %else psubw m2, m6 ; m2 = word: row 2 psubw m3, m6 ; m3 = word: row 3 movu [r2 + r3 * 2], m2 movu [r2 + r6], m3 %endif movu xm2, [r0 + r4] ; m2 = row 7 punpckhbw xm3, xm1, xm2 punpcklbw xm1, xm2 vinserti128 m1, m1, xm3, 1 pmaddubsw m3, m1, [r5 + mmsize] paddw m4, m3 pmaddubsw m1, [r5] lea r0, [r0 + r1 * 4] movu xm3, [r0] ; m3 = row 8 punpckhbw xm5, xm2, xm3 punpcklbw xm2, xm3 vinserti128 m2, m2, xm5, 1 pmaddubsw m5, m2, [r5 + mmsize] paddw m0, m5 pmaddubsw m2, [r5] lea r2, [r2 + r3 * 4] %ifidn %1,pp pmulhrsw m4, m6 ; m4 = word: row 4 pmulhrsw m0, m6 ; m0 = word: row 5 packuswb m4, m0 vpermq m4, m4, 11011000b vextracti128 xm0, m4, 1 movu [r2], xm4 movu [r2 + r3], xm0 %else psubw m4, m6 ; m4 = word: row 4 psubw m0, m6 ; m0 = word: row 5 movu [r2], m4 movu [r2 + r3], m0 %endif movu xm5, [r0 + r1] ; m5 = row 9 punpckhbw xm4, xm3, xm5 punpcklbw xm3, xm5 vinserti128 m3, m3, xm4, 1 pmaddubsw m3, [r5 + mmsize] 
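; m3 holds rows 8/9 weighted by taps 2-3; accumulate into m1 to
; complete output row 6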
paddw m1, m3 movu xm4, [r0 + r1 * 2] ; m4 = row 10 punpckhbw xm0, xm5, xm4 punpcklbw xm5, xm4 vinserti128 m5, m5, xm0, 1 pmaddubsw m5, [r5 + mmsize] paddw m2, m5 %ifidn %1,pp pmulhrsw m1, m6 ; m1 = word: row 6 pmulhrsw m2, m6 ; m2 = word: row 7 packuswb m1, m2 vpermq m1, m1, 11011000b vextracti128 xm2, m1, 1 movu [r2 + r3 * 2], xm1 movu [r2 + r6], xm2 %else psubw m1, m6 ; m1 = word: row 6 psubw m2, m6 ; m2 = word: row 7 movu [r2 + r3 * 2], m1 movu [r2 + r6], m2 %endif RET %endmacro FILTER_VER_CHROMA_AVX2_16x8 pp FILTER_VER_CHROMA_AVX2_16x8 ps %macro FILTER_VER_CHROMA_AVX2_16x12 1 INIT_YMM avx2 %if ARCH_X86_64 == 1 cglobal interp_4tap_vert_%1_16x12, 4, 6, 10 mov r4d, r4m shl r4d, 6 %ifdef PIC lea r5, [tab_ChromaCoeffVer_32] add r5, r4 %else lea r5, [tab_ChromaCoeffVer_32 + r4] %endif mova m8, [r5] mova m9, [r5 + mmsize] lea r4, [r1 * 3] sub r0, r1 %ifidn %1,pp mova m7, [pw_512] %else add r3d, r3d vbroadcasti128 m7, [pw_2000] %endif lea r5, [r3 * 3] movu xm0, [r0] vinserti128 m0, m0, [r0 + r1 * 2], 1 movu xm1, [r0 + r1] vinserti128 m1, m1, [r0 + r4], 1 punpcklbw m2, m0, m1 punpckhbw m3, m0, m1 vperm2i128 m4, m2, m3, 0x20 vperm2i128 m2, m2, m3, 0x31 pmaddubsw m4, m8 pmaddubsw m3, m2, m9 paddw m4, m3 pmaddubsw m2, m8 vextracti128 xm0, m0, 1 lea r0, [r0 + r1 * 4] vinserti128 m0, m0, [r0], 1 punpcklbw m5, m1, m0 punpckhbw m3, m1, m0 vperm2i128 m6, m5, m3, 0x20 vperm2i128 m5, m5, m3, 0x31 pmaddubsw m6, m8 pmaddubsw m3, m5, m9 paddw m6, m3 pmaddubsw m5, m8 %ifidn %1,pp pmulhrsw m4, m7 ; m4 = word: row 0 pmulhrsw m6, m7 ; m6 = word: row 1 packuswb m4, m6 vpermq m4, m4, 11011000b vextracti128 xm6, m4, 1 movu [r2], xm4 movu [r2 + r3], xm6 %else psubw m4, m7 ; m4 = word: row 0 psubw m6, m7 ; m6 = word: row 1 movu [r2], m4 movu [r2 + r3], m6 %endif movu xm4, [r0 + r1 * 2] vinserti128 m4, m4, [r0 + r1], 1 vextracti128 xm1, m4, 1 vinserti128 m0, m0, xm1, 0 punpcklbw m6, m0, m4 punpckhbw m1, m0, m4 vperm2i128 m0, m6, m1, 0x20 vperm2i128 m6, m6, m1, 0x31 pmaddubsw m1, m0, m9 paddw m5, m1 pmaddubsw m0, m8 pmaddubsw m1, m6, m9 paddw m2, m1 pmaddubsw m6, m8 %ifidn %1,pp pmulhrsw m2, m7 ; m2 = word: row 2 pmulhrsw m5, m7 ; m5 = word: row 3 packuswb m2, m5 vpermq m2, m2, 11011000b vextracti128 xm5, m2, 1 movu [r2 + r3 * 2], xm2 movu [r2 + r5], xm5 %else psubw m2, m7 ; m2 = word: row 2 psubw m5, m7 ; m5 = word: row 3 movu [r2 + r3 * 2], m2 movu [r2 + r5], m5 %endif lea r2, [r2 + r3 * 4] movu xm1, [r0 + r4] lea r0, [r0 + r1 * 4] vinserti128 m1, m1, [r0], 1 vinserti128 m4, m4, xm1, 1 punpcklbw m2, m4, m1 punpckhbw m5, m4, m1 vperm2i128 m3, m2, m5, 0x20 vperm2i128 m2, m2, m5, 0x31 pmaddubsw m5, m3, m9 paddw m6, m5 pmaddubsw m3, m8 pmaddubsw m5, m2, m9 paddw m0, m5 pmaddubsw m2, m8 %ifidn %1,pp pmulhrsw m6, m7 ; m6 = word: row 4 pmulhrsw m0, m7 ; m0 = word: row 5 packuswb m6, m0 vpermq m6, m6, 11011000b vextracti128 xm0, m6, 1 movu [r2], xm6 movu [r2 + r3], xm0 %else psubw m6, m7 ; m6 = word: row 4 psubw m0, m7 ; m0 = word: row 5 movu [r2], m6 movu [r2 + r3], m0 %endif movu xm6, [r0 + r1 * 2] vinserti128 m6, m6, [r0 + r1], 1 vextracti128 xm0, m6, 1 vinserti128 m1, m1, xm0, 0 punpcklbw m4, m1, m6 punpckhbw m5, m1, m6 vperm2i128 m0, m4, m5, 0x20 vperm2i128 m5, m4, m5, 0x31 pmaddubsw m4, m0, m9 paddw m2, m4 pmaddubsw m0, m8 pmaddubsw m4, m5, m9 paddw m3, m4 pmaddubsw m5, m8 %ifidn %1,pp pmulhrsw m3, m7 ; m3 = word: row 6 pmulhrsw m2, m7 ; m2 = word: row 7 packuswb m3, m2 vpermq m3, m3, 11011000b vextracti128 xm2, m3, 1 movu [r2 + r3 * 2], xm3 movu [r2 + r5], xm2 %else psubw m3, m7 ; m3 = word: row 6 psubw m2, m7 ; m2 = 
word: row 7 movu [r2 + r3 * 2], m3 movu [r2 + r5], m2 %endif lea r2, [r2 + r3 * 4] movu xm3, [r0 + r4] lea r0, [r0 + r1 * 4] vinserti128 m3, m3, [r0], 1 vinserti128 m6, m6, xm3, 1 punpcklbw m2, m6, m3 punpckhbw m1, m6, m3 vperm2i128 m4, m2, m1, 0x20 vperm2i128 m2, m2, m1, 0x31 pmaddubsw m1, m4, m9 paddw m5, m1 pmaddubsw m4, m8 pmaddubsw m1, m2, m9 paddw m0, m1 pmaddubsw m2, m8 %ifidn %1,pp pmulhrsw m5, m7 ; m5 = word: row 8 pmulhrsw m0, m7 ; m0 = word: row 9 packuswb m5, m0 vpermq m5, m5, 11011000b vextracti128 xm0, m5, 1 movu [r2], xm5 movu [r2 + r3], xm0 %else psubw m5, m7 ; m5 = word: row 8 psubw m0, m7 ; m0 = word: row 9 movu [r2], m5 movu [r2 + r3], m0 %endif movu xm5, [r0 + r1 * 2] vinserti128 m5, m5, [r0 + r1], 1 vextracti128 xm0, m5, 1 vinserti128 m3, m3, xm0, 0 punpcklbw m1, m3, m5 punpckhbw m0, m3, m5 vperm2i128 m6, m1, m0, 0x20 vperm2i128 m0, m1, m0, 0x31 pmaddubsw m1, m6, m9 paddw m2, m1 pmaddubsw m1, m0, m9 paddw m4, m1 %ifidn %1,pp pmulhrsw m4, m7 ; m4 = word: row 10 pmulhrsw m2, m7 ; m2 = word: row 11 packuswb m4, m2 vpermq m4, m4, 11011000b vextracti128 xm2, m4, 1 movu [r2 + r3 * 2], xm4 movu [r2 + r5], xm2 %else psubw m4, m7 ; m4 = word: row 10 psubw m2, m7 ; m2 = word: row 11 movu [r2 + r3 * 2], m4 movu [r2 + r5], m2 %endif RET %endif %endmacro FILTER_VER_CHROMA_AVX2_16x12 pp FILTER_VER_CHROMA_AVX2_16x12 ps %macro FILTER_VER_CHROMA_AVX2_16xN 2 %if ARCH_X86_64 == 1 INIT_YMM avx2 cglobal interp_4tap_vert_%1_16x%2, 4, 8, 8 mov r4d, r4m shl r4d, 6 %ifdef PIC lea r5, [tab_ChromaCoeffVer_32] add r5, r4 %else lea r5, [tab_ChromaCoeffVer_32 + r4] %endif lea r4, [r1 * 3] sub r0, r1 %ifidn %1,pp mova m7, [pw_512] %else add r3d, r3d mova m7, [pw_2000] %endif lea r6, [r3 * 3] mov r7d, %2 / 16 .loopH: movu xm0, [r0] vinserti128 m0, m0, [r0 + r1 * 2], 1 movu xm1, [r0 + r1] vinserti128 m1, m1, [r0 + r4], 1 punpcklbw m2, m0, m1 punpckhbw m3, m0, m1 vperm2i128 m4, m2, m3, 0x20 vperm2i128 m2, m2, m3, 0x31 pmaddubsw m4, [r5] pmaddubsw m3, m2, [r5 + mmsize] paddw m4, m3 pmaddubsw m2, [r5] vextracti128 xm0, m0, 1 lea r0, [r0 + r1 * 4] vinserti128 m0, m0, [r0], 1 punpcklbw m5, m1, m0 punpckhbw m3, m1, m0 vperm2i128 m6, m5, m3, 0x20 vperm2i128 m5, m5, m3, 0x31 pmaddubsw m6, [r5] pmaddubsw m3, m5, [r5 + mmsize] paddw m6, m3 pmaddubsw m5, [r5] %ifidn %1,pp pmulhrsw m4, m7 ; m4 = word: row 0 pmulhrsw m6, m7 ; m6 = word: row 1 packuswb m4, m6 vpermq m4, m4, 11011000b vextracti128 xm6, m4, 1 movu [r2], xm4 movu [r2 + r3], xm6 %else psubw m4, m7 ; m4 = word: row 0 psubw m6, m7 ; m6 = word: row 1 movu [r2], m4 movu [r2 + r3], m6 %endif movu xm4, [r0 + r1 * 2] vinserti128 m4, m4, [r0 + r1], 1 vextracti128 xm1, m4, 1 vinserti128 m0, m0, xm1, 0 punpcklbw m6, m0, m4 punpckhbw m1, m0, m4 vperm2i128 m0, m6, m1, 0x20 vperm2i128 m6, m6, m1, 0x31 pmaddubsw m1, m0, [r5 + mmsize] paddw m5, m1 pmaddubsw m0, [r5] pmaddubsw m1, m6, [r5 + mmsize] paddw m2, m1 pmaddubsw m6, [r5] %ifidn %1,pp pmulhrsw m2, m7 ; m2 = word: row 2 pmulhrsw m5, m7 ; m5 = word: row 3 packuswb m2, m5 vpermq m2, m2, 11011000b vextracti128 xm5, m2, 1 movu [r2 + r3 * 2], xm2 movu [r2 + r6], xm5 %else psubw m2, m7 ; m2 = word: row 2 psubw m5, m7 ; m5 = word: row 3 movu [r2 + r3 * 2], m2 movu [r2 + r6], m5 %endif lea r2, [r2 + r3 * 4] movu xm1, [r0 + r4] lea r0, [r0 + r1 * 4] vinserti128 m1, m1, [r0], 1 vinserti128 m4, m4, xm1, 1 punpcklbw m2, m4, m1 punpckhbw m5, m4, m1 vperm2i128 m3, m2, m5, 0x20 vperm2i128 m2, m2, m5, 0x31 pmaddubsw m5, m3, [r5 + mmsize] paddw m6, m5 pmaddubsw m3, [r5] pmaddubsw m5, m2, [r5 + mmsize] paddw m0, m5 pmaddubsw m2, 
[r5] %ifidn %1,pp pmulhrsw m6, m7 ; m6 = word: row 4 pmulhrsw m0, m7 ; m0 = word: row 5 packuswb m6, m0 vpermq m6, m6, 11011000b vextracti128 xm0, m6, 1 movu [r2], xm6 movu [r2 + r3], xm0 %else psubw m6, m7 ; m6 = word: row 4 psubw m0, m7 ; m0 = word: row 5 movu [r2], m6 movu [r2 + r3], m0 %endif movu xm6, [r0 + r1 * 2] vinserti128 m6, m6, [r0 + r1], 1 vextracti128 xm0, m6, 1 vinserti128 m1, m1, xm0, 0 punpcklbw m4, m1, m6 punpckhbw m5, m1, m6 vperm2i128 m0, m4, m5, 0x20 vperm2i128 m5, m4, m5, 0x31 pmaddubsw m4, m0, [r5 + mmsize] paddw m2, m4 pmaddubsw m0, [r5] pmaddubsw m4, m5, [r5 + mmsize] paddw m3, m4 pmaddubsw m5, [r5] %ifidn %1,pp pmulhrsw m3, m7 ; m3 = word: row 6 pmulhrsw m2, m7 ; m2 = word: row 7 packuswb m3, m2 vpermq m3, m3, 11011000b vextracti128 xm2, m3, 1 movu [r2 + r3 * 2], xm3 movu [r2 + r6], xm2 %else psubw m3, m7 ; m3 = word: row 6 psubw m2, m7 ; m2 = word: row 7 movu [r2 + r3 * 2], m3 movu [r2 + r6], m2 %endif lea r2, [r2 + r3 * 4] movu xm3, [r0 + r4] lea r0, [r0 + r1 * 4] vinserti128 m3, m3, [r0], 1 vinserti128 m6, m6, xm3, 1 punpcklbw m2, m6, m3 punpckhbw m1, m6, m3 vperm2i128 m4, m2, m1, 0x20 vperm2i128 m2, m2, m1, 0x31 pmaddubsw m1, m4, [r5 + mmsize] paddw m5, m1 pmaddubsw m4, [r5] pmaddubsw m1, m2, [r5 + mmsize] paddw m0, m1 pmaddubsw m2, [r5] %ifidn %1,pp pmulhrsw m5, m7 ; m5 = word: row 8 pmulhrsw m0, m7 ; m0 = word: row 9 packuswb m5, m0 vpermq m5, m5, 11011000b vextracti128 xm0, m5, 1 movu [r2], xm5 movu [r2 + r3], xm0 %else psubw m5, m7 ; m5 = word: row 8 psubw m0, m7 ; m0 = word: row 9 movu [r2], m5 movu [r2 + r3], m0 %endif movu xm5, [r0 + r1 * 2] vinserti128 m5, m5, [r0 + r1], 1 vextracti128 xm0, m5, 1 vinserti128 m3, m3, xm0, 0 punpcklbw m1, m3, m5 punpckhbw m0, m3, m5 vperm2i128 m6, m1, m0, 0x20 vperm2i128 m0, m1, m0, 0x31 pmaddubsw m1, m6, [r5 + mmsize] paddw m2, m1 pmaddubsw m6, [r5] pmaddubsw m1, m0, [r5 + mmsize] paddw m4, m1 pmaddubsw m0, [r5] %ifidn %1,pp pmulhrsw m4, m7 ; m4 = word: row 10 pmulhrsw m2, m7 ; m2 = word: row 11 packuswb m4, m2 vpermq m4, m4, 11011000b vextracti128 xm2, m4, 1 movu [r2 + r3 * 2], xm4 movu [r2 + r6], xm2 %else psubw m4, m7 ; m4 = word: row 10 psubw m2, m7 ; m2 = word: row 11 movu [r2 + r3 * 2], m4 movu [r2 + r6], m2 %endif lea r2, [r2 + r3 * 4] movu xm3, [r0 + r4] lea r0, [r0 + r1 * 4] vinserti128 m3, m3, [r0], 1 vinserti128 m5, m5, xm3, 1 punpcklbw m2, m5, m3 punpckhbw m1, m5, m3 vperm2i128 m4, m2, m1, 0x20 vperm2i128 m2, m2, m1, 0x31 pmaddubsw m1, m4, [r5 + mmsize] paddw m0, m1 pmaddubsw m4, [r5] pmaddubsw m1, m2, [r5 + mmsize] paddw m6, m1 pmaddubsw m2, [r5] %ifidn %1,pp pmulhrsw m0, m7 ; m0 = word: row 12 pmulhrsw m6, m7 ; m6 = word: row 13 packuswb m0, m6 vpermq m0, m0, 11011000b vextracti128 xm6, m0, 1 movu [r2], xm0 movu [r2 + r3], xm6 %else psubw m0, m7 ; m0 = word: row 12 psubw m6, m7 ; m6 = word: row 13 movu [r2], m0 movu [r2 + r3], m6 %endif movu xm5, [r0 + r1 * 2] vinserti128 m5, m5, [r0 + r1], 1 vextracti128 xm0, m5, 1 vinserti128 m3, m3, xm0, 0 punpcklbw m1, m3, m5 punpckhbw m0, m3, m5 vperm2i128 m6, m1, m0, 0x20 vperm2i128 m0, m1, m0, 0x31 pmaddubsw m6, [r5 + mmsize] paddw m2, m6 pmaddubsw m0, [r5 + mmsize] paddw m4, m0 %ifidn %1,pp pmulhrsw m4, m7 ; m4 = word: row 14 pmulhrsw m2, m7 ; m2 = word: row 15 packuswb m4, m2 vpermq m4, m4, 11011000b vextracti128 xm2, m4, 1 movu [r2 + r3 * 2], xm4 movu [r2 + r6], xm2 %else psubw m4, m7 ; m4 = word: row 14 psubw m2, m7 ; m2 = word: row 15 movu [r2 + r3 * 2], m4 movu [r2 + r6], m2 %endif lea r2, [r2 + r3 * 4] dec r7d jnz .loopH RET %endif %endmacro 
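;-----------------------------------------------------------------------------
; The instantiations below emit both output flavours of the 4-tap vertical
; chroma filter (descriptive summary of the macro bodies above):
;   pp: pixel in, pixel out  - pmulhrsw with pw_512 computes the rounded
;       (sum + 32) >> 6, then packuswb clips the result to [0, 255]
;   ps: pixel in, short out  - psubw with pw_2000 (0x2000 = 8192) recenters
;       the raw sum and stores 16-bit intermediates; the destination stride
;       is doubled first ("add r3d, r3d") to account for word-sized output
;-----------------------------------------------------------------------------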
FILTER_VER_CHROMA_AVX2_16xN pp, 32 FILTER_VER_CHROMA_AVX2_16xN ps, 32 FILTER_VER_CHROMA_AVX2_16xN pp, 64 FILTER_VER_CHROMA_AVX2_16xN ps, 64 %macro FILTER_VER_CHROMA_AVX2_16x24 1 %if ARCH_X86_64 == 1 INIT_YMM avx2 cglobal interp_4tap_vert_%1_16x24, 4, 6, 15 mov r4d, r4m shl r4d, 6 %ifdef PIC lea r5, [tab_ChromaCoeffVer_32] add r5, r4 %else lea r5, [tab_ChromaCoeffVer_32 + r4] %endif mova m12, [r5] mova m13, [r5 + mmsize] lea r4, [r1 * 3] sub r0, r1 %ifidn %1,pp mova m14, [pw_512] %else add r3d, r3d vbroadcasti128 m14, [pw_2000] %endif lea r5, [r3 * 3] movu xm0, [r0] ; m0 = row 0 movu xm1, [r0 + r1] ; m1 = row 1 punpckhbw xm2, xm0, xm1 punpcklbw xm0, xm1 vinserti128 m0, m0, xm2, 1 pmaddubsw m0, m12 movu xm2, [r0 + r1 * 2] ; m2 = row 2 punpckhbw xm3, xm1, xm2 punpcklbw xm1, xm2 vinserti128 m1, m1, xm3, 1 pmaddubsw m1, m12 movu xm3, [r0 + r4] ; m3 = row 3 punpckhbw xm4, xm2, xm3 punpcklbw xm2, xm3 vinserti128 m2, m2, xm4, 1 pmaddubsw m4, m2, m13 paddw m0, m4 pmaddubsw m2, m12 lea r0, [r0 + r1 * 4] movu xm4, [r0] ; m4 = row 4 punpckhbw xm5, xm3, xm4 punpcklbw xm3, xm4 vinserti128 m3, m3, xm5, 1 pmaddubsw m5, m3, m13 paddw m1, m5 pmaddubsw m3, m12 movu xm5, [r0 + r1] ; m5 = row 5 punpckhbw xm6, xm4, xm5 punpcklbw xm4, xm5 vinserti128 m4, m4, xm6, 1 pmaddubsw m6, m4, m13 paddw m2, m6 pmaddubsw m4, m12 movu xm6, [r0 + r1 * 2] ; m6 = row 6 punpckhbw xm7, xm5, xm6 punpcklbw xm5, xm6 vinserti128 m5, m5, xm7, 1 pmaddubsw m7, m5, m13 paddw m3, m7 pmaddubsw m5, m12 movu xm7, [r0 + r4] ; m7 = row 7 punpckhbw xm8, xm6, xm7 punpcklbw xm6, xm7 vinserti128 m6, m6, xm8, 1 pmaddubsw m8, m6, m13 paddw m4, m8 pmaddubsw m6, m12 lea r0, [r0 + r1 * 4] movu xm8, [r0] ; m8 = row 8 punpckhbw xm9, xm7, xm8 punpcklbw xm7, xm8 vinserti128 m7, m7, xm9, 1 pmaddubsw m9, m7, m13 paddw m5, m9 pmaddubsw m7, m12 movu xm9, [r0 + r1] ; m9 = row 9 punpckhbw xm10, xm8, xm9 punpcklbw xm8, xm9 vinserti128 m8, m8, xm10, 1 pmaddubsw m10, m8, m13 paddw m6, m10 pmaddubsw m8, m12 movu xm10, [r0 + r1 * 2] ; m10 = row 10 punpckhbw xm11, xm9, xm10 punpcklbw xm9, xm10 vinserti128 m9, m9, xm11, 1 pmaddubsw m11, m9, m13 paddw m7, m11 pmaddubsw m9, m12 %ifidn %1,pp pmulhrsw m0, m14 ; m0 = word: row 0 pmulhrsw m1, m14 ; m1 = word: row 1 pmulhrsw m2, m14 ; m2 = word: row 2 pmulhrsw m3, m14 ; m3 = word: row 3 pmulhrsw m4, m14 ; m4 = word: row 4 pmulhrsw m5, m14 ; m5 = word: row 5 pmulhrsw m6, m14 ; m6 = word: row 6 pmulhrsw m7, m14 ; m7 = word: row 7 packuswb m0, m1 packuswb m2, m3 packuswb m4, m5 packuswb m6, m7 vpermq m0, m0, q3120 vpermq m2, m2, q3120 vpermq m4, m4, q3120 vpermq m6, m6, q3120 vextracti128 xm1, m0, 1 vextracti128 xm3, m2, 1 vextracti128 xm5, m4, 1 vextracti128 xm7, m6, 1 movu [r2], xm0 movu [r2 + r3], xm1 movu [r2 + r3 * 2], xm2 movu [r2 + r5], xm3 lea r2, [r2 + r3 * 4] movu [r2], xm4 movu [r2 + r3], xm5 movu [r2 + r3 * 2], xm6 movu [r2 + r5], xm7 %else psubw m0, m14 ; m0 = word: row 0 psubw m1, m14 ; m1 = word: row 1 psubw m2, m14 ; m2 = word: row 2 psubw m3, m14 ; m3 = word: row 3 psubw m4, m14 ; m4 = word: row 4 psubw m5, m14 ; m5 = word: row 5 psubw m6, m14 ; m6 = word: row 6 psubw m7, m14 ; m7 = word: row 7 movu [r2], m0 movu [r2 + r3], m1 movu [r2 + r3 * 2], m2 movu [r2 + r5], m3 lea r2, [r2 + r3 * 4] movu [r2], m4 movu [r2 + r3], m5 movu [r2 + r3 * 2], m6 movu [r2 + r5], m7 %endif lea r2, [r2 + r3 * 4] movu xm11, [r0 + r4] ; m11 = row 11 punpckhbw xm6, xm10, xm11 punpcklbw xm10, xm11 vinserti128 m10, m10, xm6, 1 pmaddubsw m6, m10, m13 paddw m8, m6 pmaddubsw m10, m12 lea r0, [r0 + r1 * 4] movu xm6, [r0] ; m6 = row 12 
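; note (descriptive): row 12 has just been loaded; because r0 was pre-offset
; by "sub r0, r1", each newly loaded row n completes the 4-tap window of
; output row n-3 - here rows 11/12 close output rows 8/9 (paddw m8/m9 below).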
punpckhbw xm7, xm11, xm6 punpcklbw xm11, xm6 vinserti128 m11, m11, xm7, 1 pmaddubsw m7, m11, m13 paddw m9, m7 pmaddubsw m11, m12 movu xm7, [r0 + r1] ; m7 = row 13 punpckhbw xm0, xm6, xm7 punpcklbw xm6, xm7 vinserti128 m6, m6, xm0, 1 pmaddubsw m0, m6, m13 paddw m10, m0 pmaddubsw m6, m12 movu xm0, [r0 + r1 * 2] ; m0 = row 14 punpckhbw xm1, xm7, xm0 punpcklbw xm7, xm0 vinserti128 m7, m7, xm1, 1 pmaddubsw m1, m7, m13 paddw m11, m1 pmaddubsw m7, m12 movu xm1, [r0 + r4] ; m1 = row 15 punpckhbw xm2, xm0, xm1 punpcklbw xm0, xm1 vinserti128 m0, m0, xm2, 1 pmaddubsw m2, m0, m13 paddw m6, m2 pmaddubsw m0, m12 lea r0, [r0 + r1 * 4] movu xm2, [r0] ; m2 = row 16 punpckhbw xm3, xm1, xm2 punpcklbw xm1, xm2 vinserti128 m1, m1, xm3, 1 pmaddubsw m3, m1, m13 paddw m7, m3 pmaddubsw m1, m12 movu xm3, [r0 + r1] ; m3 = row 17 punpckhbw xm4, xm2, xm3 punpcklbw xm2, xm3 vinserti128 m2, m2, xm4, 1 pmaddubsw m4, m2, m13 paddw m0, m4 pmaddubsw m2, m12 movu xm4, [r0 + r1 * 2] ; m4 = row 18 punpckhbw xm5, xm3, xm4 punpcklbw xm3, xm4 vinserti128 m3, m3, xm5, 1 pmaddubsw m5, m3, m13 paddw m1, m5 pmaddubsw m3, m12 %ifidn %1,pp pmulhrsw m8, m14 ; m8 = word: row 8 pmulhrsw m9, m14 ; m9 = word: row 9 pmulhrsw m10, m14 ; m10 = word: row 10 pmulhrsw m11, m14 ; m11 = word: row 11 pmulhrsw m6, m14 ; m6 = word: row 12 pmulhrsw m7, m14 ; m7 = word: row 13 pmulhrsw m0, m14 ; m0 = word: row 14 pmulhrsw m1, m14 ; m1 = word: row 15 packuswb m8, m9 packuswb m10, m11 packuswb m6, m7 packuswb m0, m1 vpermq m8, m8, q3120 vpermq m10, m10, q3120 vpermq m6, m6, q3120 vpermq m0, m0, q3120 vextracti128 xm9, m8, 1 vextracti128 xm11, m10, 1 vextracti128 xm7, m6, 1 vextracti128 xm1, m0, 1 movu [r2], xm8 movu [r2 + r3], xm9 movu [r2 + r3 * 2], xm10 movu [r2 + r5], xm11 lea r2, [r2 + r3 * 4] movu [r2], xm6 movu [r2 + r3], xm7 movu [r2 + r3 * 2], xm0 movu [r2 + r5], xm1 %else psubw m8, m14 ; m8 = word: row 8 psubw m9, m14 ; m9 = word: row 9 psubw m10, m14 ; m10 = word: row 10 psubw m11, m14 ; m11 = word: row 11 psubw m6, m14 ; m6 = word: row 12 psubw m7, m14 ; m7 = word: row 13 psubw m0, m14 ; m0 = word: row 14 psubw m1, m14 ; m1 = word: row 15 movu [r2], m8 movu [r2 + r3], m9 movu [r2 + r3 * 2], m10 movu [r2 + r5], m11 lea r2, [r2 + r3 * 4] movu [r2], m6 movu [r2 + r3], m7 movu [r2 + r3 * 2], m0 movu [r2 + r5], m1 %endif lea r2, [r2 + r3 * 4] movu xm5, [r0 + r4] ; m5 = row 19 punpckhbw xm6, xm4, xm5 punpcklbw xm4, xm5 vinserti128 m4, m4, xm6, 1 pmaddubsw m6, m4, m13 paddw m2, m6 pmaddubsw m4, m12 lea r0, [r0 + r1 * 4] movu xm6, [r0] ; m6 = row 20 punpckhbw xm7, xm5, xm6 punpcklbw xm5, xm6 vinserti128 m5, m5, xm7, 1 pmaddubsw m7, m5, m13 paddw m3, m7 pmaddubsw m5, m12 movu xm7, [r0 + r1] ; m7 = row 21 punpckhbw xm0, xm6, xm7 punpcklbw xm6, xm7 vinserti128 m6, m6, xm0, 1 pmaddubsw m0, m6, m13 paddw m4, m0 pmaddubsw m6, m12 movu xm0, [r0 + r1 * 2] ; m0 = row 22 punpckhbw xm1, xm7, xm0 punpcklbw xm7, xm0 vinserti128 m7, m7, xm1, 1 pmaddubsw m1, m7, m13 paddw m5, m1 pmaddubsw m7, m12 movu xm1, [r0 + r4] ; m1 = row 23 punpckhbw xm8, xm0, xm1 punpcklbw xm0, xm1 vinserti128 m0, m0, xm8, 1 pmaddubsw m8, m0, m13 paddw m6, m8 pmaddubsw m0, m12 lea r0, [r0 + r1 * 4] movu xm8, [r0] ; m8 = row 24 punpckhbw xm9, xm1, xm8 punpcklbw xm1, xm8 vinserti128 m1, m1, xm9, 1 pmaddubsw m9, m1, m13 paddw m7, m9 pmaddubsw m1, m12 movu xm9, [r0 + r1] ; m9 = row 25 punpckhbw xm10, xm8, xm9 punpcklbw xm8, xm9 vinserti128 m8, m8, xm10, 1 pmaddubsw m8, m13 paddw m0, m8 movu xm10, [r0 + r1 * 2] ; m10 = row 26 punpckhbw xm11, xm9, xm10 punpcklbw xm9, xm10 vinserti128 m9, m9, xm11, 1 
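; note (descriptive): rows 25/26 are tail-only inputs - they are multiplied by
; the trailing coefficient pair (m13) and folded into output rows 22/23;
; no new sums are started before the final store block.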
pmaddubsw m9, m13 paddw m1, m9 %ifidn %1,pp pmulhrsw m2, m14 ; m2 = word: row 16 pmulhrsw m3, m14 ; m3 = word: row 17 pmulhrsw m4, m14 ; m4 = word: row 18 pmulhrsw m5, m14 ; m5 = word: row 19 pmulhrsw m6, m14 ; m6 = word: row 20 pmulhrsw m7, m14 ; m7 = word: row 21 pmulhrsw m0, m14 ; m0 = word: row 22 pmulhrsw m1, m14 ; m1 = word: row 23 packuswb m2, m3 packuswb m4, m5 packuswb m6, m7 packuswb m0, m1 vpermq m2, m2, q3120 vpermq m4, m4, q3120 vpermq m6, m6, q3120 vpermq m0, m0, q3120 vextracti128 xm3, m2, 1 vextracti128 xm5, m4, 1 vextracti128 xm7, m6, 1 vextracti128 xm1, m0, 1 movu [r2], xm2 movu [r2 + r3], xm3 movu [r2 + r3 * 2], xm4 movu [r2 + r5], xm5 lea r2, [r2 + r3 * 4] movu [r2], xm6 movu [r2 + r3], xm7 movu [r2 + r3 * 2], xm0 movu [r2 + r5], xm1 %else psubw m2, m14 ; m2 = word: row 16 psubw m3, m14 ; m3 = word: row 17 psubw m4, m14 ; m4 = word: row 18 psubw m5, m14 ; m5 = word: row 19 psubw m6, m14 ; m6 = word: row 20 psubw m7, m14 ; m7 = word: row 21 psubw m0, m14 ; m0 = word: row 22 psubw m1, m14 ; m1 = word: row 23 movu [r2], m2 movu [r2 + r3], m3 movu [r2 + r3 * 2], m4 movu [r2 + r5], m5 lea r2, [r2 + r3 * 4] movu [r2], m6 movu [r2 + r3], m7 movu [r2 + r3 * 2], m0 movu [r2 + r5], m1 %endif RET %endif %endmacro FILTER_VER_CHROMA_AVX2_16x24 pp FILTER_VER_CHROMA_AVX2_16x24 ps %macro FILTER_VER_CHROMA_AVX2_24x32 1 INIT_YMM avx2 %if ARCH_X86_64 == 1 cglobal interp_4tap_vert_%1_24x32, 4, 9, 10 mov r4d, r4m shl r4d, 6 %ifdef PIC lea r5, [tab_ChromaCoeffVer_32] add r5, r4 %else lea r5, [tab_ChromaCoeffVer_32 + r4] %endif mova m8, [r5] mova m9, [r5 + mmsize] lea r4, [r1 * 3] sub r0, r1 %ifidn %1,pp mova m7, [pw_512] %else add r3d, r3d vbroadcasti128 m7, [pw_2000] %endif lea r6, [r3 * 3] mov r5d, 2 .loopH: movu xm0, [r0] vinserti128 m0, m0, [r0 + r1 * 2], 1 movu xm1, [r0 + r1] vinserti128 m1, m1, [r0 + r4], 1 punpcklbw m2, m0, m1 punpckhbw m3, m0, m1 vperm2i128 m4, m2, m3, 0x20 vperm2i128 m2, m2, m3, 0x31 pmaddubsw m4, m8 pmaddubsw m3, m2, m9 paddw m4, m3 pmaddubsw m2, m8 vextracti128 xm0, m0, 1 lea r7, [r0 + r1 * 4] vinserti128 m0, m0, [r7], 1 punpcklbw m5, m1, m0 punpckhbw m3, m1, m0 vperm2i128 m6, m5, m3, 0x20 vperm2i128 m5, m5, m3, 0x31 pmaddubsw m6, m8 pmaddubsw m3, m5, m9 paddw m6, m3 pmaddubsw m5, m8 %ifidn %1,pp pmulhrsw m4, m7 ; m4 = word: row 0 pmulhrsw m6, m7 ; m6 = word: row 1 packuswb m4, m6 vpermq m4, m4, 11011000b vextracti128 xm6, m4, 1 movu [r2], xm4 movu [r2 + r3], xm6 %else psubw m4, m7 ; m4 = word: row 0 psubw m6, m7 ; m6 = word: row 1 movu [r2], m4 movu [r2 + r3], m6 %endif movu xm4, [r7 + r1 * 2] vinserti128 m4, m4, [r7 + r1], 1 vextracti128 xm1, m4, 1 vinserti128 m0, m0, xm1, 0 punpcklbw m6, m0, m4 punpckhbw m1, m0, m4 vperm2i128 m0, m6, m1, 0x20 vperm2i128 m6, m6, m1, 0x31 pmaddubsw m1, m0, m9 paddw m5, m1 pmaddubsw m0, m8 pmaddubsw m1, m6, m9 paddw m2, m1 pmaddubsw m6, m8 %ifidn %1,pp pmulhrsw m2, m7 ; m2 = word: row 2 pmulhrsw m5, m7 ; m5 = word: row 3 packuswb m2, m5 vpermq m2, m2, 11011000b vextracti128 xm5, m2, 1 movu [r2 + r3 * 2], xm2 movu [r2 + r6], xm5 %else psubw m2, m7 ; m2 = word: row 2 psubw m5, m7 ; m5 = word: row 3 movu [r2 + r3 * 2], m2 movu [r2 + r6], m5 %endif lea r8, [r2 + r3 * 4] movu xm1, [r7 + r4] lea r7, [r7 + r1 * 4] vinserti128 m1, m1, [r7], 1 vinserti128 m4, m4, xm1, 1 punpcklbw m2, m4, m1 punpckhbw m5, m4, m1 vperm2i128 m3, m2, m5, 0x20 vperm2i128 m2, m2, m5, 0x31 pmaddubsw m5, m3, m9 paddw m6, m5 pmaddubsw m3, m8 pmaddubsw m5, m2, m9 paddw m0, m5 pmaddubsw m2, m8 %ifidn %1,pp pmulhrsw m6, m7 ; m6 = word: row 4 pmulhrsw m0, m7 ; m0 = 
word: row 5 packuswb m6, m0 vpermq m6, m6, 11011000b vextracti128 xm0, m6, 1 movu [r8], xm6 movu [r8 + r3], xm0 %else psubw m6, m7 ; m6 = word: row 4 psubw m0, m7 ; m0 = word: row 5 movu [r8], m6 movu [r8 + r3], m0 %endif movu xm6, [r7 + r1 * 2] vinserti128 m6, m6, [r7 + r1], 1 vextracti128 xm0, m6, 1 vinserti128 m1, m1, xm0, 0 punpcklbw m4, m1, m6 punpckhbw m5, m1, m6 vperm2i128 m0, m4, m5, 0x20 vperm2i128 m5, m4, m5, 0x31 pmaddubsw m4, m0, m9 paddw m2, m4 pmaddubsw m0, m8 pmaddubsw m4, m5, m9 paddw m3, m4 pmaddubsw m5, m8 %ifidn %1,pp pmulhrsw m3, m7 ; m3 = word: row 6 pmulhrsw m2, m7 ; m2 = word: row 7 packuswb m3, m2 vpermq m3, m3, 11011000b vextracti128 xm2, m3, 1 movu [r8 + r3 * 2], xm3 movu [r8 + r6], xm2 %else psubw m3, m7 ; m3 = word: row 6 psubw m2, m7 ; m2 = word: row 7 movu [r8 + r3 * 2], m3 movu [r8 + r6], m2 %endif lea r8, [r8 + r3 * 4] movu xm3, [r7 + r4] lea r7, [r7 + r1 * 4] vinserti128 m3, m3, [r7], 1 vinserti128 m6, m6, xm3, 1 punpcklbw m2, m6, m3 punpckhbw m1, m6, m3 vperm2i128 m4, m2, m1, 0x20 vperm2i128 m2, m2, m1, 0x31 pmaddubsw m1, m4, m9 paddw m5, m1 pmaddubsw m4, m8 pmaddubsw m1, m2, m9 paddw m0, m1 pmaddubsw m2, m8 %ifidn %1,pp pmulhrsw m5, m7 ; m5 = word: row 8 pmulhrsw m0, m7 ; m0 = word: row 9 packuswb m5, m0 vpermq m5, m5, 11011000b vextracti128 xm0, m5, 1 movu [r8], xm5 movu [r8 + r3], xm0 %else psubw m5, m7 ; m5 = word: row 8 psubw m0, m7 ; m0 = word: row 9 movu [r8], m5 movu [r8 + r3], m0 %endif movu xm5, [r7 + r1 * 2] vinserti128 m5, m5, [r7 + r1], 1 vextracti128 xm0, m5, 1 vinserti128 m3, m3, xm0, 0 punpcklbw m1, m3, m5 punpckhbw m0, m3, m5 vperm2i128 m6, m1, m0, 0x20 vperm2i128 m0, m1, m0, 0x31 pmaddubsw m1, m6, m9 paddw m2, m1 pmaddubsw m6, m8 pmaddubsw m1, m0, m9 paddw m4, m1 pmaddubsw m0, m8 %ifidn %1,pp pmulhrsw m4, m7 ; m4 = word: row 10 pmulhrsw m2, m7 ; m2 = word: row 11 packuswb m4, m2 vpermq m4, m4, 11011000b vextracti128 xm2, m4, 1 movu [r8 + r3 * 2], xm4 movu [r8 + r6], xm2 %else psubw m4, m7 ; m4 = word: row 10 psubw m2, m7 ; m2 = word: row 11 movu [r8 + r3 * 2], m4 movu [r8 + r6], m2 %endif lea r8, [r8 + r3 * 4] movu xm3, [r7 + r4] lea r7, [r7 + r1 * 4] vinserti128 m3, m3, [r7], 1 vinserti128 m5, m5, xm3, 1 punpcklbw m2, m5, m3 punpckhbw m1, m5, m3 vperm2i128 m4, m2, m1, 0x20 vperm2i128 m2, m2, m1, 0x31 pmaddubsw m1, m4, m9 paddw m0, m1 pmaddubsw m4, m8 pmaddubsw m1, m2, m9 paddw m6, m1 pmaddubsw m2, m8 %ifidn %1,pp pmulhrsw m0, m7 ; m0 = word: row 12 pmulhrsw m6, m7 ; m6 = word: row 13 packuswb m0, m6 vpermq m0, m0, 11011000b vextracti128 xm6, m0, 1 movu [r8], xm0 movu [r8 + r3], xm6 %else psubw m0, m7 ; m0 = word: row 12 psubw m6, m7 ; m6 = word: row 13 movu [r8], m0 movu [r8 + r3], m6 %endif movu xm5, [r7 + r1 * 2] vinserti128 m5, m5, [r7 + r1], 1 vextracti128 xm0, m5, 1 vinserti128 m3, m3, xm0, 0 punpcklbw m1, m3, m5 punpckhbw m0, m3, m5 vperm2i128 m6, m1, m0, 0x20 vperm2i128 m0, m1, m0, 0x31 pmaddubsw m6, m9 paddw m2, m6 pmaddubsw m0, m9 paddw m4, m0 %ifidn %1,pp pmulhrsw m4, m7 ; m4 = word: row 14 pmulhrsw m2, m7 ; m2 = word: row 15 packuswb m4, m2 vpermq m4, m4, 11011000b vextracti128 xm2, m4, 1 movu [r8 + r3 * 2], xm4 movu [r8 + r6], xm2 add r2, 16 %else psubw m4, m7 ; m4 = word: row 14 psubw m2, m7 ; m2 = word: row 15 movu [r8 + r3 * 2], m4 movu [r8 + r6], m2 add r2, 32 %endif add r0, 16 movq xm1, [r0] ; m1 = row 0 movq xm2, [r0 + r1] ; m2 = row 1 punpcklbw xm1, xm2 movq xm3, [r0 + r1 * 2] ; m3 = row 2 punpcklbw xm2, xm3 vinserti128 m5, m1, xm2, 1 pmaddubsw m5, m8 movq xm4, [r0 + r4] ; m4 = row 3 punpcklbw xm3, xm4 lea r7, [r0 + r1 
* 4] movq xm1, [r7] ; m1 = row 4 punpcklbw xm4, xm1 vinserti128 m2, m3, xm4, 1 pmaddubsw m0, m2, m9 paddw m5, m0 pmaddubsw m2, m8 movq xm3, [r7 + r1] ; m3 = row 5 punpcklbw xm1, xm3 movq xm4, [r7 + r1 * 2] ; m4 = row 6 punpcklbw xm3, xm4 vinserti128 m1, m1, xm3, 1 pmaddubsw m0, m1, m9 paddw m2, m0 pmaddubsw m1, m8 movq xm3, [r7 + r4] ; m3 = row 7 punpcklbw xm4, xm3 lea r7, [r7 + r1 * 4] movq xm0, [r7] ; m0 = row 8 punpcklbw xm3, xm0 vinserti128 m4, m4, xm3, 1 pmaddubsw m3, m4, m9 paddw m1, m3 pmaddubsw m4, m8 movq xm3, [r7 + r1] ; m3 = row 9 punpcklbw xm0, xm3 movq xm6, [r7 + r1 * 2] ; m6 = row 10 punpcklbw xm3, xm6 vinserti128 m0, m0, xm3, 1 pmaddubsw m3, m0, m9 paddw m4, m3 pmaddubsw m0, m8 %ifidn %1,pp pmulhrsw m5, m7 ; m5 = word: row 0, row 1 pmulhrsw m2, m7 ; m2 = word: row 2, row 3 pmulhrsw m1, m7 ; m1 = word: row 4, row 5 pmulhrsw m4, m7 ; m4 = word: row 6, row 7 packuswb m5, m2 packuswb m1, m4 vextracti128 xm2, m5, 1 vextracti128 xm4, m1, 1 movq [r2], xm5 movq [r2 + r3], xm2 movhps [r2 + r3 * 2], xm5 movhps [r2 + r6], xm2 lea r8, [r2 + r3 * 4] movq [r8], xm1 movq [r8 + r3], xm4 movhps [r8 + r3 * 2], xm1 movhps [r8 + r6], xm4 %else psubw m5, m7 ; m5 = word: row 0, row 1 psubw m2, m7 ; m2 = word: row 2, row 3 psubw m1, m7 ; m1 = word: row 4, row 5 psubw m4, m7 ; m4 = word: row 6, row 7 vextracti128 xm3, m5, 1 movu [r2], xm5 movu [r2 + r3], xm3 vextracti128 xm3, m2, 1 movu [r2 + r3 * 2], xm2 movu [r2 + r6], xm3 vextracti128 xm3, m1, 1 lea r8, [r2 + r3 * 4] movu [r8], xm1 movu [r8 + r3], xm3 vextracti128 xm3, m4, 1 movu [r8 + r3 * 2], xm4 movu [r8 + r6], xm3 %endif lea r8, [r8 + r3 * 4] movq xm3, [r7 + r4] ; m3 = row 11 punpcklbw xm6, xm3 lea r7, [r7 + r1 * 4] movq xm5, [r7] ; m5 = row 12 punpcklbw xm3, xm5 vinserti128 m6, m6, xm3, 1 pmaddubsw m3, m6, m9 paddw m0, m3 pmaddubsw m6, m8 movq xm3, [r7 + r1] ; m3 = row 13 punpcklbw xm5, xm3 movq xm2, [r7 + r1 * 2] ; m2 = row 14 punpcklbw xm3, xm2 vinserti128 m5, m5, xm3, 1 pmaddubsw m3, m5, m9 paddw m6, m3 pmaddubsw m5, m8 movq xm3, [r7 + r4] ; m3 = row 15 punpcklbw xm2, xm3 lea r7, [r7 + r1 * 4] movq xm1, [r7] ; m1 = row 16 punpcklbw xm3, xm1 vinserti128 m2, m2, xm3, 1 pmaddubsw m3, m2, m9 paddw m5, m3 pmaddubsw m2, m8 movq xm3, [r7 + r1] ; m3 = row 17 punpcklbw xm1, xm3 movq xm4, [r7 + r1 * 2] ; m4 = row 18 punpcklbw xm3, xm4 vinserti128 m1, m1, xm3, 1 pmaddubsw m3, m1, m9 paddw m2, m3 %ifidn %1,pp pmulhrsw m0, m7 ; m0 = word: row 8, row 9 pmulhrsw m6, m7 ; m6 = word: row 10, row 11 pmulhrsw m5, m7 ; m5 = word: row 12, row 13 pmulhrsw m2, m7 ; m2 = word: row 14, row 15 packuswb m0, m6 packuswb m5, m2 vextracti128 xm6, m0, 1 vextracti128 xm2, m5, 1 movq [r8], xm0 movq [r8 + r3], xm6 movhps [r8 + r3 * 2], xm0 movhps [r8 + r6], xm6 lea r8, [r8 + r3 * 4] movq [r8], xm5 movq [r8 + r3], xm2 movhps [r8 + r3 * 2], xm5 movhps [r8 + r6], xm2 lea r2, [r8 + r3 * 4 - 16] %else psubw m0, m7 ; m0 = word: row 8, row 9 psubw m6, m7 ; m6 = word: row 10, row 11 psubw m5, m7 ; m5 = word: row 12, row 13 psubw m2, m7 ; m2 = word: row 14, row 15 vextracti128 xm3, m0, 1 movu [r8], xm0 movu [r8 + r3], xm3 vextracti128 xm3, m6, 1 movu [r8 + r3 * 2], xm6 movu [r8 + r6], xm3 vextracti128 xm3, m5, 1 lea r8, [r8 + r3 * 4] movu [r8], xm5 movu [r8 + r3], xm3 vextracti128 xm3, m2, 1 movu [r8 + r3 * 2], xm2 movu [r8 + r6], xm3 lea r2, [r8 + r3 * 4 - 32] %endif lea r0, [r7 - 16] dec r5d jnz .loopH RET %endif %endmacro FILTER_VER_CHROMA_AVX2_24x32 pp FILTER_VER_CHROMA_AVX2_24x32 ps %macro FILTER_VER_CHROMA_AVX2_24x64 1 %if ARCH_X86_64 == 1 INIT_YMM avx2 cglobal 
interp_4tap_vert_%1_24x64, 4, 7, 13 mov r4d, r4m shl r4d, 6 %ifdef PIC lea r5, [tab_ChromaCoeffVer_32] add r5, r4 %else lea r5, [tab_ChromaCoeffVer_32 + r4] %endif mova m10, [r5] mova m11, [r5 + mmsize] lea r4, [r1 * 3] sub r0, r1 %ifidn %1,pp mova m12, [pw_512] %else add r3d, r3d vbroadcasti128 m12, [pw_2000] %endif lea r5, [r3 * 3] mov r6d, 16 .loopH: movu m0, [r0] ; m0 = row 0 movu m1, [r0 + r1] ; m1 = row 1 punpcklbw m2, m0, m1 punpckhbw m3, m0, m1 pmaddubsw m2, m10 pmaddubsw m3, m10 movu m0, [r0 + r1 * 2] ; m0 = row 2 punpcklbw m4, m1, m0 punpckhbw m5, m1, m0 pmaddubsw m4, m10 pmaddubsw m5, m10 movu m1, [r0 + r4] ; m1 = row 3 punpcklbw m6, m0, m1 punpckhbw m7, m0, m1 pmaddubsw m8, m6, m11 pmaddubsw m9, m7, m11 pmaddubsw m6, m10 pmaddubsw m7, m10 paddw m2, m8 paddw m3, m9 %ifidn %1,pp pmulhrsw m2, m12 pmulhrsw m3, m12 packuswb m2, m3 movu [r2], xm2 vextracti128 xm2, m2, 1 movq [r2 + 16], xm2 %else psubw m2, m12 psubw m3, m12 vperm2i128 m0, m2, m3, 0x20 vperm2i128 m2, m2, m3, 0x31 movu [r2], m0 movu [r2 + mmsize], xm2 %endif lea r0, [r0 + r1 * 4] movu m0, [r0] ; m0 = row 4 punpcklbw m2, m1, m0 punpckhbw m3, m1, m0 pmaddubsw m8, m2, m11 pmaddubsw m9, m3, m11 pmaddubsw m2, m10 pmaddubsw m3, m10 paddw m4, m8 paddw m5, m9 %ifidn %1,pp pmulhrsw m4, m12 pmulhrsw m5, m12 packuswb m4, m5 movu [r2 + r3], xm4 vextracti128 xm4, m4, 1 movq [r2 + r3 + 16], xm4 %else psubw m4, m12 psubw m5, m12 vperm2i128 m1, m4, m5, 0x20 vperm2i128 m4, m4, m5, 0x31 movu [r2 + r3], m1 movu [r2 + r3 + mmsize], xm4 %endif movu m1, [r0 + r1] ; m1 = row 5 punpcklbw m4, m0, m1 punpckhbw m5, m0, m1 pmaddubsw m4, m11 pmaddubsw m5, m11 paddw m6, m4 paddw m7, m5 %ifidn %1,pp pmulhrsw m6, m12 pmulhrsw m7, m12 packuswb m6, m7 movu [r2 + r3 * 2], xm6 vextracti128 xm6, m6, 1 movq [r2 + r3 * 2 + 16], xm6 %else psubw m6, m12 psubw m7, m12 vperm2i128 m0, m6, m7, 0x20 vperm2i128 m6, m6, m7, 0x31 movu [r2 + r3 * 2], m0 movu [r2 + r3 * 2 + mmsize], xm6 %endif movu m0, [r0 + r1 * 2] ; m0 = row 6 punpcklbw m6, m1, m0 punpckhbw m7, m1, m0 pmaddubsw m6, m11 pmaddubsw m7, m11 paddw m2, m6 paddw m3, m7 %ifidn %1,pp pmulhrsw m2, m12 pmulhrsw m3, m12 packuswb m2, m3 movu [r2 + r5], xm2 vextracti128 xm2, m2, 1 movq [r2 + r5 + 16], xm2 %else psubw m2, m12 psubw m3, m12 vperm2i128 m0, m2, m3, 0x20 vperm2i128 m2, m2, m3, 0x31 movu [r2 + r5], m0 movu [r2 + r5 + mmsize], xm2 %endif lea r2, [r2 + r3 * 4] dec r6d jnz .loopH RET %endif %endmacro FILTER_VER_CHROMA_AVX2_24x64 pp FILTER_VER_CHROMA_AVX2_24x64 ps %macro FILTER_VER_CHROMA_AVX2_16x4 1 INIT_YMM avx2 cglobal interp_4tap_vert_%1_16x4, 4, 6, 8 mov r4d, r4m shl r4d, 6 %ifdef PIC lea r5, [tab_ChromaCoeffVer_32] add r5, r4 %else lea r5, [tab_ChromaCoeffVer_32 + r4] %endif lea r4, [r1 * 3] sub r0, r1 %ifidn %1,pp mova m7, [pw_512] %else add r3d, r3d mova m7, [pw_2000] %endif movu xm0, [r0] vinserti128 m0, m0, [r0 + r1 * 2], 1 movu xm1, [r0 + r1] vinserti128 m1, m1, [r0 + r4], 1 punpcklbw m2, m0, m1 punpckhbw m3, m0, m1 vperm2i128 m4, m2, m3, 0x20 vperm2i128 m2, m2, m3, 0x31 pmaddubsw m4, [r5] pmaddubsw m3, m2, [r5 + mmsize] paddw m4, m3 pmaddubsw m2, [r5] vextracti128 xm0, m0, 1 lea r0, [r0 + r1 * 4] vinserti128 m0, m0, [r0], 1 punpcklbw m5, m1, m0 punpckhbw m3, m1, m0 vperm2i128 m6, m5, m3, 0x20 vperm2i128 m5, m5, m3, 0x31 pmaddubsw m6, [r5] pmaddubsw m3, m5, [r5 + mmsize] paddw m6, m3 pmaddubsw m5, [r5] %ifidn %1,pp pmulhrsw m4, m7 ; m4 = word: row 0 pmulhrsw m6, m7 ; m6 = word: row 1 packuswb m4, m6 vpermq m4, m4, 11011000b vextracti128 xm6, m4, 1 movu [r2], xm4 movu [r2 + r3], xm6 %else psubw m4, 
m7 ; m4 = word: row 0 psubw m6, m7 ; m6 = word: row 1 movu [r2], m4 movu [r2 + r3], m6 %endif lea r2, [r2 + r3 * 2] movu xm4, [r0 + r1 * 2] vinserti128 m4, m4, [r0 + r1], 1 vextracti128 xm1, m4, 1 vinserti128 m0, m0, xm1, 0 punpcklbw m6, m0, m4 punpckhbw m1, m0, m4 vperm2i128 m0, m6, m1, 0x20 vperm2i128 m6, m6, m1, 0x31 pmaddubsw m0, [r5 + mmsize] paddw m5, m0 pmaddubsw m6, [r5 + mmsize] paddw m2, m6 %ifidn %1,pp pmulhrsw m2, m7 ; m2 = word: row 2 pmulhrsw m5, m7 ; m5 = word: row 3 packuswb m2, m5 vpermq m2, m2, 11011000b vextracti128 xm5, m2, 1 movu [r2], xm2 movu [r2 + r3], xm5 %else psubw m2, m7 ; m2 = word: row 2 psubw m5, m7 ; m5 = word: row 3 movu [r2], m2 movu [r2 + r3], m5 %endif RET %endmacro FILTER_VER_CHROMA_AVX2_16x4 pp FILTER_VER_CHROMA_AVX2_16x4 ps %macro FILTER_VER_CHROMA_AVX2_12xN 2 INIT_YMM avx2 cglobal interp_4tap_vert_%1_12x%2, 4, 7, 8 mov r4d, r4m shl r4d, 6 %ifdef PIC lea r5, [tab_ChromaCoeffVer_32] add r5, r4 %else lea r5, [tab_ChromaCoeffVer_32 + r4] %endif lea r4, [r1 * 3] sub r0, r1 %ifidn %1,pp mova m7, [pw_512] %else add r3d, r3d vbroadcasti128 m7, [pw_2000] %endif lea r6, [r3 * 3] %rep %2 / 16 movu xm0, [r0] ; m0 = row 0 movu xm1, [r0 + r1] ; m1 = row 1 punpckhbw xm2, xm0, xm1 punpcklbw xm0, xm1 vinserti128 m0, m0, xm2, 1 pmaddubsw m0, [r5] movu xm2, [r0 + r1 * 2] ; m2 = row 2 punpckhbw xm3, xm1, xm2 punpcklbw xm1, xm2 vinserti128 m1, m1, xm3, 1 pmaddubsw m1, [r5] movu xm3, [r0 + r4] ; m3 = row 3 punpckhbw xm4, xm2, xm3 punpcklbw xm2, xm3 vinserti128 m2, m2, xm4, 1 pmaddubsw m4, m2, [r5 + 1 * mmsize] paddw m0, m4 pmaddubsw m2, [r5] lea r0, [r0 + r1 * 4] movu xm4, [r0] ; m4 = row 4 punpckhbw xm5, xm3, xm4 punpcklbw xm3, xm4 vinserti128 m3, m3, xm5, 1 pmaddubsw m5, m3, [r5 + 1 * mmsize] paddw m1, m5 pmaddubsw m3, [r5] %ifidn %1,pp pmulhrsw m0, m7 ; m0 = word: row 0 pmulhrsw m1, m7 ; m1 = word: row 1 packuswb m0, m1 vextracti128 xm1, m0, 1 movq [r2], xm0 movd [r2 + 8], xm1 movhps [r2 + r3], xm0 pextrd [r2 + r3 + 8], xm1, 2 %else psubw m0, m7 ; m0 = word: row 0 psubw m1, m7 ; m1 = word: row 1 movu [r2], xm0 vextracti128 xm0, m0, 1 movq [r2 + 16], xm0 movu [r2 + r3], xm1 vextracti128 xm1, m1, 1 movq [r2 + r3 + 16], xm1 %endif movu xm5, [r0 + r1] ; m5 = row 5 punpckhbw xm6, xm4, xm5 punpcklbw xm4, xm5 vinserti128 m4, m4, xm6, 1 pmaddubsw m6, m4, [r5 + 1 * mmsize] paddw m2, m6 pmaddubsw m4, [r5] movu xm6, [r0 + r1 * 2] ; m6 = row 6 punpckhbw xm0, xm5, xm6 punpcklbw xm5, xm6 vinserti128 m5, m5, xm0, 1 pmaddubsw m0, m5, [r5 + 1 * mmsize] paddw m3, m0 pmaddubsw m5, [r5] %ifidn %1,pp pmulhrsw m2, m7 ; m2 = word: row 2 pmulhrsw m3, m7 ; m3 = word: row 3 packuswb m2, m3 vextracti128 xm3, m2, 1 movq [r2 + r3 * 2], xm2 movd [r2 + r3 * 2 + 8], xm3 movhps [r2 + r6], xm2 pextrd [r2 + r6 + 8], xm3, 2 %else psubw m2, m7 ; m2 = word: row 2 psubw m3, m7 ; m3 = word: row 3 movu [r2 + r3 * 2], xm2 vextracti128 xm2, m2, 1 movq [r2 + r3 * 2 + 16], xm2 movu [r2 + r6], xm3 vextracti128 xm3, m3, 1 movq [r2 + r6 + 16], xm3 %endif lea r2, [r2 + r3 * 4] movu xm0, [r0 + r4] ; m0 = row 7 punpckhbw xm3, xm6, xm0 punpcklbw xm6, xm0 vinserti128 m6, m6, xm3, 1 pmaddubsw m3, m6, [r5 + 1 * mmsize] paddw m4, m3 pmaddubsw m6, [r5] lea r0, [r0 + r1 * 4] movu xm3, [r0] ; m3 = row 8 punpckhbw xm1, xm0, xm3 punpcklbw xm0, xm3 vinserti128 m0, m0, xm1, 1 pmaddubsw m1, m0, [r5 + 1 * mmsize] paddw m5, m1 pmaddubsw m0, [r5] %ifidn %1,pp pmulhrsw m4, m7 ; m4 = word: row 4 pmulhrsw m5, m7 ; m5 = word: row 5 packuswb m4, m5 vextracti128 xm5, m4, 1 movq [r2], xm4 movd [r2 + 8], xm5 movhps [r2 + r3], xm4 pextrd [r2 + 
r3 + 8], xm5, 2 %else psubw m4, m7 ; m4 = word: row 4 psubw m5, m7 ; m5 = word: row 5 movu [r2], xm4 vextracti128 xm4, m4, 1 movq [r2 + 16], xm4 movu [r2 + r3], xm5 vextracti128 xm5, m5, 1 movq [r2 + r3 + 16], xm5 %endif movu xm1, [r0 + r1] ; m1 = row 9 punpckhbw xm2, xm3, xm1 punpcklbw xm3, xm1 vinserti128 m3, m3, xm2, 1 pmaddubsw m2, m3, [r5 + 1 * mmsize] paddw m6, m2 pmaddubsw m3, [r5] movu xm2, [r0 + r1 * 2] ; m2 = row 10 punpckhbw xm4, xm1, xm2 punpcklbw xm1, xm2 vinserti128 m1, m1, xm4, 1 pmaddubsw m4, m1, [r5 + 1 * mmsize] paddw m0, m4 pmaddubsw m1, [r5] %ifidn %1,pp pmulhrsw m6, m7 ; m6 = word: row 6 pmulhrsw m0, m7 ; m0 = word: row 7 packuswb m6, m0 vextracti128 xm0, m6, 1 movq [r2 + r3 * 2], xm6 movd [r2 + r3 * 2 + 8], xm0 movhps [r2 + r6], xm6 pextrd [r2 + r6 + 8], xm0, 2 %else psubw m6, m7 ; m6 = word: row 6 psubw m0, m7 ; m0 = word: row 7 movu [r2 + r3 * 2], xm6 vextracti128 xm6, m6, 1 movq [r2 + r3 * 2 + 16], xm6 movu [r2 + r6], xm0 vextracti128 xm0, m0, 1 movq [r2 + r6 + 16], xm0 %endif lea r2, [r2 + r3 * 4] movu xm4, [r0 + r4] ; m4 = row 11 punpckhbw xm6, xm2, xm4 punpcklbw xm2, xm4 vinserti128 m2, m2, xm6, 1 pmaddubsw m6, m2, [r5 + 1 * mmsize] paddw m3, m6 pmaddubsw m2, [r5] lea r0, [r0 + r1 * 4] movu xm6, [r0] ; m6 = row 12 punpckhbw xm0, xm4, xm6 punpcklbw xm4, xm6 vinserti128 m4, m4, xm0, 1 pmaddubsw m0, m4, [r5 + 1 * mmsize] paddw m1, m0 pmaddubsw m4, [r5] %ifidn %1,pp pmulhrsw m3, m7 ; m3 = word: row 8 pmulhrsw m1, m7 ; m1 = word: row 9 packuswb m3, m1 vextracti128 xm1, m3, 1 movq [r2], xm3 movd [r2 + 8], xm1 movhps [r2 + r3], xm3 pextrd [r2 + r3 + 8], xm1, 2 %else psubw m3, m7 ; m3 = word: row 8 psubw m1, m7 ; m1 = word: row 9 movu [r2], xm3 vextracti128 xm3, m3, 1 movq [r2 + 16], xm3 movu [r2 + r3], xm1 vextracti128 xm1, m1, 1 movq [r2 + r3 + 16], xm1 %endif movu xm0, [r0 + r1] ; m0 = row 13 punpckhbw xm1, xm6, xm0 punpcklbw xm6, xm0 vinserti128 m6, m6, xm1, 1 pmaddubsw m1, m6, [r5 + 1 * mmsize] paddw m2, m1 pmaddubsw m6, [r5] movu xm1, [r0 + r1 * 2] ; m1 = row 14 punpckhbw xm5, xm0, xm1 punpcklbw xm0, xm1 vinserti128 m0, m0, xm5, 1 pmaddubsw m5, m0, [r5 + 1 * mmsize] paddw m4, m5 pmaddubsw m0, [r5] %ifidn %1,pp pmulhrsw m2, m7 ; m2 = word: row 10 pmulhrsw m4, m7 ; m4 = word: row 11 packuswb m2, m4 vextracti128 xm4, m2, 1 movq [r2 + r3 * 2], xm2 movd [r2 + r3 * 2 + 8], xm4 movhps [r2 + r6], xm2 pextrd [r2 + r6 + 8], xm4, 2 %else psubw m2, m7 ; m2 = word: row 10 psubw m4, m7 ; m4 = word: row 11 movu [r2 + r3 * 2], xm2 vextracti128 xm2, m2, 1 movq [r2 + r3 * 2 + 16], xm2 movu [r2 + r6], xm4 vextracti128 xm4, m4, 1 movq [r2 + r6 + 16], xm4 %endif lea r2, [r2 + r3 * 4] movu xm5, [r0 + r4] ; m5 = row 15 punpckhbw xm2, xm1, xm5 punpcklbw xm1, xm5 vinserti128 m1, m1, xm2, 1 pmaddubsw m2, m1, [r5 + 1 * mmsize] paddw m6, m2 pmaddubsw m1, [r5] lea r0, [r0 + r1 * 4] movu xm2, [r0] ; m2 = row 16 punpckhbw xm3, xm5, xm2 punpcklbw xm5, xm2 vinserti128 m5, m5, xm3, 1 pmaddubsw m3, m5, [r5 + 1 * mmsize] paddw m0, m3 pmaddubsw m5, [r5] movu xm3, [r0 + r1] ; m3 = row 17 punpckhbw xm4, xm2, xm3 punpcklbw xm2, xm3 vinserti128 m2, m2, xm4, 1 pmaddubsw m2, [r5 + 1 * mmsize] paddw m1, m2 movu xm4, [r0 + r1 * 2] ; m4 = row 18 punpckhbw xm2, xm3, xm4 punpcklbw xm3, xm4 vinserti128 m3, m3, xm2, 1 pmaddubsw m3, [r5 + 1 * mmsize] paddw m5, m3 %ifidn %1,pp pmulhrsw m6, m7 ; m6 = word: row 12 pmulhrsw m0, m7 ; m0 = word: row 13 pmulhrsw m1, m7 ; m1 = word: row 14 pmulhrsw m5, m7 ; m5 = word: row 15 packuswb m6, m0 packuswb m1, m5 vextracti128 xm0, m6, 1 vextracti128 xm5, m1, 1 movq [r2], xm6 
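; note (descriptive): 12-pixel pp store pattern - the movq above wrote bytes
; 0-7 of the row, and the movd/pextrd below add bytes 8-11, so 12 of the 16
; packed results reach memory per row (the ps path instead stores 24 bytes
; per row as words via movu + movq).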
movd [r2 + 8], xm0 movhps [r2 + r3], xm6 pextrd [r2 + r3 + 8], xm0, 2 movq [r2 + r3 * 2], xm1 movd [r2 + r3 * 2 + 8], xm5 movhps [r2 + r6], xm1 pextrd [r2 + r6 + 8], xm5, 2 %else psubw m6, m7 ; m6 = word: row 12 psubw m0, m7 ; m0 = word: row 13 psubw m1, m7 ; m1 = word: row 14 psubw m5, m7 ; m5 = word: row 15 movu [r2], xm6 vextracti128 xm6, m6, 1 movq [r2 + 16], xm6 movu [r2 + r3], xm0 vextracti128 xm0, m0, 1 movq [r2 + r3 + 16], xm0 movu [r2 + r3 * 2], xm1 vextracti128 xm1, m1, 1 movq [r2 + r3 * 2 + 16], xm1 movu [r2 + r6], xm5 vextracti128 xm5, m5, 1 movq [r2 + r6 + 16], xm5 %endif lea r2, [r2 + r3 * 4] %endrep RET %endmacro FILTER_VER_CHROMA_AVX2_12xN pp, 16 FILTER_VER_CHROMA_AVX2_12xN ps, 16 FILTER_VER_CHROMA_AVX2_12xN pp, 32 FILTER_VER_CHROMA_AVX2_12xN ps, 32 ;----------------------------------------------------------------------------- ;void interp_4tap_vert_pp_24x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;----------------------------------------------------------------------------- %macro FILTER_V4_W24 2 INIT_XMM sse4 cglobal interp_4tap_vert_pp_24x%2, 4, 6, 8 mov r4d, r4m sub r0, r1 %ifdef PIC lea r5, [tab_ChromaCoeff] movd m0, [r5 + r4 * 4] %else movd m0, [tab_ChromaCoeff + r4 * 4] %endif pshufb m1, m0, [tab_Vm] pshufb m0, [tab_Vm + 16] mov r4d, %2 .loop: movu m2, [r0] movu m3, [r0 + r1] punpcklbw m4, m2, m3 punpckhbw m2, m3 pmaddubsw m4, m1 pmaddubsw m2, m1 lea r5, [r0 + 2 * r1] movu m5, [r5] movu m7, [r5 + r1] punpcklbw m6, m5, m7 pmaddubsw m6, m0 paddw m4, m6 punpckhbw m6, m5, m7 pmaddubsw m6, m0 paddw m2, m6 mova m6, [pw_512] pmulhrsw m4, m6 pmulhrsw m2, m6 packuswb m4, m2 movu [r2], m4 punpcklbw m4, m3, m5 punpckhbw m3, m5 pmaddubsw m4, m1 pmaddubsw m3, m1 movu m2, [r5 + 2 * r1] punpcklbw m5, m7, m2 punpckhbw m7, m2 pmaddubsw m5, m0 pmaddubsw m7, m0 paddw m4, m5 paddw m3, m7 pmulhrsw m4, m6 pmulhrsw m3, m6 packuswb m4, m3 movu [r2 + r3], m4 movq m2, [r0 + 16] movq m3, [r0 + r1 + 16] movq m4, [r5 + 16] movq m5, [r5 + r1 + 16] punpcklbw m2, m3 punpcklbw m4, m5 pmaddubsw m2, m1 pmaddubsw m4, m0 paddw m2, m4 pmulhrsw m2, m6 movq m3, [r0 + r1 + 16] movq m4, [r5 + 16] movq m5, [r5 + r1 + 16] movq m7, [r5 + 2 * r1 + 16] punpcklbw m3, m4 punpcklbw m5, m7 pmaddubsw m3, m1 pmaddubsw m5, m0 paddw m3, m5 pmulhrsw m3, m6 packuswb m2, m3 movh [r2 + 16], m2 movhps [r2 + r3 + 16], m2 mov r0, r5 lea r2, [r2 + 2 * r3] sub r4, 2 jnz .loop RET %endmacro FILTER_V4_W24 24, 32 FILTER_V4_W24 24, 64 ;----------------------------------------------------------------------------- ; void interp_4tap_vert_pp_32x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;----------------------------------------------------------------------------- %macro FILTER_V4_W32 2 INIT_XMM sse4 cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8 mov r4d, r4m sub r0, r1 %ifdef PIC lea r5, [tab_ChromaCoeff] movd m0, [r5 + r4 * 4] %else movd m0, [tab_ChromaCoeff + r4 * 4] %endif pshufb m1, m0, [tab_Vm] pshufb m0, [tab_Vm + 16] mova m7, [pw_512] mov r4d, %2 .loop: movu m2, [r0] movu m3, [r0 + r1] punpcklbw m4, m2, m3 punpckhbw m2, m3 pmaddubsw m4, m1 pmaddubsw m2, m1 lea r5, [r0 + 2 * r1] movu m3, [r5] movu m5, [r5 + r1] punpcklbw m6, m3, m5 punpckhbw m3, m5 pmaddubsw m6, m0 pmaddubsw m3, m0 paddw m4, m6 paddw m2, m3 pmulhrsw m4, m7 pmulhrsw m2, m7 packuswb m4, m2 movu [r2], m4 movu m2, [r0 + 16] movu m3, [r0 + r1 + 16] punpcklbw m4, m2, m3 punpckhbw m2, m3 pmaddubsw m4, m1 pmaddubsw m2, m1 movu m3, [r5 + 16] movu m5, [r5 + r1 + 16] punpcklbw m6, m3, m5 punpckhbw m3, 
m5 pmaddubsw m6, m0 pmaddubsw m3, m0 paddw m4, m6 paddw m2, m3 pmulhrsw m4, m7 pmulhrsw m2, m7 packuswb m4, m2 movu [r2 + 16], m4 lea r0, [r0 + r1] lea r2, [r2 + r3] dec r4 jnz .loop RET %endmacro FILTER_V4_W32 32, 8 FILTER_V4_W32 32, 16 FILTER_V4_W32 32, 24 FILTER_V4_W32 32, 32 FILTER_V4_W32 32, 48 FILTER_V4_W32 32, 64 %macro FILTER_VER_CHROMA_AVX2_32xN 2 %if ARCH_X86_64 == 1 INIT_YMM avx2 cglobal interp_4tap_vert_%1_32x%2, 4, 7, 13 mov r4d, r4m shl r4d, 6 %ifdef PIC lea r5, [tab_ChromaCoeffVer_32] add r5, r4 %else lea r5, [tab_ChromaCoeffVer_32 + r4] %endif mova m10, [r5] mova m11, [r5 + mmsize] lea r4, [r1 * 3] sub r0, r1 %ifidn %1,pp mova m12, [pw_512] %else add r3d, r3d vbroadcasti128 m12, [pw_2000] %endif lea r5, [r3 * 3] mov r6d, %2 / 4 .loopW: movu m0, [r0] ; m0 = row 0 movu m1, [r0 + r1] ; m1 = row 1 punpcklbw m2, m0, m1 punpckhbw m3, m0, m1 pmaddubsw m2, m10 pmaddubsw m3, m10 movu m0, [r0 + r1 * 2] ; m0 = row 2 punpcklbw m4, m1, m0 punpckhbw m5, m1, m0 pmaddubsw m4, m10 pmaddubsw m5, m10 movu m1, [r0 + r4] ; m1 = row 3 punpcklbw m6, m0, m1 punpckhbw m7, m0, m1 pmaddubsw m8, m6, m11 pmaddubsw m9, m7, m11 pmaddubsw m6, m10 pmaddubsw m7, m10 paddw m2, m8 paddw m3, m9 %ifidn %1,pp pmulhrsw m2, m12 pmulhrsw m3, m12 packuswb m2, m3 movu [r2], m2 %else psubw m2, m12 psubw m3, m12 vperm2i128 m0, m2, m3, 0x20 vperm2i128 m2, m2, m3, 0x31 movu [r2], m0 movu [r2 + mmsize], m2 %endif lea r0, [r0 + r1 * 4] movu m0, [r0] ; m0 = row 4 punpcklbw m2, m1, m0 punpckhbw m3, m1, m0 pmaddubsw m8, m2, m11 pmaddubsw m9, m3, m11 pmaddubsw m2, m10 pmaddubsw m3, m10 paddw m4, m8 paddw m5, m9 %ifidn %1,pp pmulhrsw m4, m12 pmulhrsw m5, m12 packuswb m4, m5 movu [r2 + r3], m4 %else psubw m4, m12 psubw m5, m12 vperm2i128 m1, m4, m5, 0x20 vperm2i128 m4, m4, m5, 0x31 movu [r2 + r3], m1 movu [r2 + r3 + mmsize], m4 %endif movu m1, [r0 + r1] ; m1 = row 5 punpcklbw m4, m0, m1 punpckhbw m5, m0, m1 pmaddubsw m4, m11 pmaddubsw m5, m11 paddw m6, m4 paddw m7, m5 %ifidn %1,pp pmulhrsw m6, m12 pmulhrsw m7, m12 packuswb m6, m7 movu [r2 + r3 * 2], m6 %else psubw m6, m12 psubw m7, m12 vperm2i128 m0, m6, m7, 0x20 vperm2i128 m6, m6, m7, 0x31 movu [r2 + r3 * 2], m0 movu [r2 + r3 * 2 + mmsize], m6 %endif movu m0, [r0 + r1 * 2] ; m0 = row 6 punpcklbw m6, m1, m0 punpckhbw m7, m1, m0 pmaddubsw m6, m11 pmaddubsw m7, m11 paddw m2, m6 paddw m3, m7 %ifidn %1,pp pmulhrsw m2, m12 pmulhrsw m3, m12 packuswb m2, m3 movu [r2 + r5], m2 %else psubw m2, m12 psubw m3, m12 vperm2i128 m0, m2, m3, 0x20 vperm2i128 m2, m2, m3, 0x31 movu [r2 + r5], m0 movu [r2 + r5 + mmsize], m2 %endif lea r2, [r2 + r3 * 4] dec r6d jnz .loopW RET %endif %endmacro FILTER_VER_CHROMA_AVX2_32xN pp, 64 FILTER_VER_CHROMA_AVX2_32xN pp, 48 FILTER_VER_CHROMA_AVX2_32xN pp, 32 FILTER_VER_CHROMA_AVX2_32xN pp, 24 FILTER_VER_CHROMA_AVX2_32xN pp, 16 FILTER_VER_CHROMA_AVX2_32xN pp, 8 FILTER_VER_CHROMA_AVX2_32xN ps, 64 FILTER_VER_CHROMA_AVX2_32xN ps, 48 FILTER_VER_CHROMA_AVX2_32xN ps, 32 FILTER_VER_CHROMA_AVX2_32xN ps, 24 FILTER_VER_CHROMA_AVX2_32xN ps, 16 FILTER_VER_CHROMA_AVX2_32xN ps, 8 %macro FILTER_VER_CHROMA_AVX2_48x64 1 %if ARCH_X86_64 == 1 INIT_YMM avx2 cglobal interp_4tap_vert_%1_48x64, 4, 8, 13 mov r4d, r4m shl r4d, 6 %ifdef PIC lea r5, [tab_ChromaCoeffVer_32] add r5, r4 %else lea r5, [tab_ChromaCoeffVer_32 + r4] %endif mova m10, [r5] mova m11, [r5 + mmsize] lea r4, [r1 * 3] sub r0, r1 %ifidn %1,pp mova m12, [pw_512] %else add r3d, r3d vbroadcasti128 m12, [pw_2000] %endif lea r5, [r3 * 3] lea r7, [r1 * 4] mov r6d, 16 .loopH: movu m0, [r0] ; m0 = row 0 movu m1, [r0 + r1] ; m1 = 
row 1 punpcklbw m2, m0, m1 punpckhbw m3, m0, m1 pmaddubsw m2, m10 pmaddubsw m3, m10 movu m0, [r0 + r1 * 2] ; m0 = row 2 punpcklbw m4, m1, m0 punpckhbw m5, m1, m0 pmaddubsw m4, m10 pmaddubsw m5, m10 movu m1, [r0 + r4] ; m1 = row 3 punpcklbw m6, m0, m1 punpckhbw m7, m0, m1 pmaddubsw m8, m6, m11 pmaddubsw m9, m7, m11 pmaddubsw m6, m10 pmaddubsw m7, m10 paddw m2, m8 paddw m3, m9 %ifidn %1,pp pmulhrsw m2, m12 pmulhrsw m3, m12 packuswb m2, m3 movu [r2], m2 %else psubw m2, m12 psubw m3, m12 vperm2i128 m0, m2, m3, 0x20 vperm2i128 m2, m2, m3, 0x31 movu [r2], m0 movu [r2 + mmsize], m2 %endif lea r0, [r0 + r1 * 4] movu m0, [r0] ; m0 = row 4 punpcklbw m2, m1, m0 punpckhbw m3, m1, m0 pmaddubsw m8, m2, m11 pmaddubsw m9, m3, m11 pmaddubsw m2, m10 pmaddubsw m3, m10 paddw m4, m8 paddw m5, m9 %ifidn %1,pp pmulhrsw m4, m12 pmulhrsw m5, m12 packuswb m4, m5 movu [r2 + r3], m4 %else psubw m4, m12 psubw m5, m12 vperm2i128 m1, m4, m5, 0x20 vperm2i128 m4, m4, m5, 0x31 movu [r2 + r3], m1 movu [r2 + r3 + mmsize], m4 %endif movu m1, [r0 + r1] ; m1 = row 5 punpcklbw m4, m0, m1 punpckhbw m5, m0, m1 pmaddubsw m4, m11 pmaddubsw m5, m11 paddw m6, m4 paddw m7, m5 %ifidn %1,pp pmulhrsw m6, m12 pmulhrsw m7, m12 packuswb m6, m7 movu [r2 + r3 * 2], m6 %else psubw m6, m12 psubw m7, m12 vperm2i128 m0, m6, m7, 0x20 vperm2i128 m6, m6, m7, 0x31 movu [r2 + r3 * 2], m0 movu [r2 + r3 * 2 + mmsize], m6 %endif movu m0, [r0 + r1 * 2] ; m0 = row 6 punpcklbw m6, m1, m0 punpckhbw m7, m1, m0 pmaddubsw m6, m11 pmaddubsw m7, m11 paddw m2, m6 paddw m3, m7 %ifidn %1,pp pmulhrsw m2, m12 pmulhrsw m3, m12 packuswb m2, m3 movu [r2 + r5], m2 add r2, 32 %else psubw m2, m12 psubw m3, m12 vperm2i128 m0, m2, m3, 0x20 vperm2i128 m2, m2, m3, 0x31 movu [r2 + r5], m0 movu [r2 + r5 + mmsize], m2 add r2, 64 %endif sub r0, r7 movu xm0, [r0 + 32] ; m0 = row 0 movu xm1, [r0 + r1 + 32] ; m1 = row 1 punpckhbw xm2, xm0, xm1 punpcklbw xm0, xm1 vinserti128 m0, m0, xm2, 1 pmaddubsw m0, m10 movu xm2, [r0 + r1 * 2 + 32] ; m2 = row 2 punpckhbw xm3, xm1, xm2 punpcklbw xm1, xm2 vinserti128 m1, m1, xm3, 1 pmaddubsw m1, m10 movu xm3, [r0 + r4 + 32] ; m3 = row 3 punpckhbw xm4, xm2, xm3 punpcklbw xm2, xm3 vinserti128 m2, m2, xm4, 1 pmaddubsw m4, m2, m11 paddw m0, m4 pmaddubsw m2, m10 lea r0, [r0 + r1 * 4] movu xm4, [r0 + 32] ; m4 = row 4 punpckhbw xm5, xm3, xm4 punpcklbw xm3, xm4 vinserti128 m3, m3, xm5, 1 pmaddubsw m5, m3, m11 paddw m1, m5 pmaddubsw m3, m10 movu xm5, [r0 + r1 + 32] ; m5 = row 5 punpckhbw xm6, xm4, xm5 punpcklbw xm4, xm5 vinserti128 m4, m4, xm6, 1 pmaddubsw m4, m11 paddw m2, m4 movu xm6, [r0 + r1 * 2 + 32] ; m6 = row 6 punpckhbw xm7, xm5, xm6 punpcklbw xm5, xm6 vinserti128 m5, m5, xm7, 1 pmaddubsw m5, m11 paddw m3, m5 %ifidn %1,pp pmulhrsw m0, m12 ; m0 = word: row 0 pmulhrsw m1, m12 ; m1 = word: row 1 pmulhrsw m2, m12 ; m2 = word: row 2 pmulhrsw m3, m12 ; m3 = word: row 3 packuswb m0, m1 packuswb m2, m3 vpermq m0, m0, 11011000b vpermq m2, m2, 11011000b vextracti128 xm1, m0, 1 vextracti128 xm3, m2, 1 movu [r2], xm0 movu [r2 + r3], xm1 movu [r2 + r3 * 2], xm2 movu [r2 + r5], xm3 lea r2, [r2 + r3 * 4 - 32] %else psubw m0, m12 ; m0 = word: row 0 psubw m1, m12 ; m1 = word: row 1 psubw m2, m12 ; m2 = word: row 2 psubw m3, m12 ; m3 = word: row 3 movu [r2], m0 movu [r2 + r3], m1 movu [r2 + r3 * 2], m2 movu [r2 + r5], m3 lea r2, [r2 + r3 * 4 - 64] %endif dec r6d jnz .loopH RET %endif %endmacro FILTER_VER_CHROMA_AVX2_48x64 pp FILTER_VER_CHROMA_AVX2_48x64 ps %macro FILTER_VER_CHROMA_AVX2_64xN 2 %if ARCH_X86_64 == 1 INIT_YMM avx2 cglobal interp_4tap_vert_%1_64x%2, 4, 8, 13 
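; note (descriptive): 64-wide rows are filtered as two 32-byte halves - the
; "%rep 2" block below walks x over {0, 32}, rewinding r0 with r7 (4 * srcStride)
; after the first half so both halves read the same four source rows, then
; advancing by r7 once per iteration of .loopH.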
mov r4d, r4m shl r4d, 6 %ifdef PIC lea r5, [tab_ChromaCoeffVer_32] add r5, r4 %else lea r5, [tab_ChromaCoeffVer_32 + r4] %endif mova m10, [r5] mova m11, [r5 + mmsize] lea r4, [r1 * 3] sub r0, r1 %ifidn %1,pp mova m12, [pw_512] %else add r3d, r3d vbroadcasti128 m12, [pw_2000] %endif lea r5, [r3 * 3] lea r7, [r1 * 4] mov r6d, %2 / 4 .loopH: %assign x 0 %rep 2 movu m0, [r0 + x] ; m0 = row 0 movu m1, [r0 + r1 + x] ; m1 = row 1 punpcklbw m2, m0, m1 punpckhbw m3, m0, m1 pmaddubsw m2, m10 pmaddubsw m3, m10 movu m0, [r0 + r1 * 2 + x] ; m0 = row 2 punpcklbw m4, m1, m0 punpckhbw m5, m1, m0 pmaddubsw m4, m10 pmaddubsw m5, m10 movu m1, [r0 + r4 + x] ; m1 = row 3 punpcklbw m6, m0, m1 punpckhbw m7, m0, m1 pmaddubsw m8, m6, m11 pmaddubsw m9, m7, m11 pmaddubsw m6, m10 pmaddubsw m7, m10 paddw m2, m8 paddw m3, m9 %ifidn %1,pp pmulhrsw m2, m12 pmulhrsw m3, m12 packuswb m2, m3 movu [r2], m2 %else psubw m2, m12 psubw m3, m12 vperm2i128 m0, m2, m3, 0x20 vperm2i128 m2, m2, m3, 0x31 movu [r2], m0 movu [r2 + mmsize], m2 %endif lea r0, [r0 + r1 * 4] movu m0, [r0 + x] ; m0 = row 4 punpcklbw m2, m1, m0 punpckhbw m3, m1, m0 pmaddubsw m8, m2, m11 pmaddubsw m9, m3, m11 pmaddubsw m2, m10 pmaddubsw m3, m10 paddw m4, m8 paddw m5, m9 %ifidn %1,pp pmulhrsw m4, m12 pmulhrsw m5, m12 packuswb m4, m5 movu [r2 + r3], m4 %else psubw m4, m12 psubw m5, m12 vperm2i128 m1, m4, m5, 0x20 vperm2i128 m4, m4, m5, 0x31 movu [r2 + r3], m1 movu [r2 + r3 + mmsize], m4 %endif movu m1, [r0 + r1 + x] ; m1 = row 5 punpcklbw m4, m0, m1 punpckhbw m5, m0, m1 pmaddubsw m4, m11 pmaddubsw m5, m11 paddw m6, m4 paddw m7, m5 %ifidn %1,pp pmulhrsw m6, m12 pmulhrsw m7, m12 packuswb m6, m7 movu [r2 + r3 * 2], m6 %else psubw m6, m12 psubw m7, m12 vperm2i128 m0, m6, m7, 0x20 vperm2i128 m6, m6, m7, 0x31 movu [r2 + r3 * 2], m0 movu [r2 + r3 * 2 + mmsize], m6 %endif movu m0, [r0 + r1 * 2 + x] ; m0 = row 6 punpcklbw m6, m1, m0 punpckhbw m7, m1, m0 pmaddubsw m6, m11 pmaddubsw m7, m11 paddw m2, m6 paddw m3, m7 %ifidn %1,pp pmulhrsw m2, m12 pmulhrsw m3, m12 packuswb m2, m3 movu [r2 + r5], m2 add r2, 32 %else psubw m2, m12 psubw m3, m12 vperm2i128 m0, m2, m3, 0x20 vperm2i128 m2, m2, m3, 0x31 movu [r2 + r5], m0 movu [r2 + r5 + mmsize], m2 add r2, 64 %endif sub r0, r7 %assign x x+32 %endrep %ifidn %1,pp lea r2, [r2 + r3 * 4 - 64] %else lea r2, [r2 + r3 * 4 - 128] %endif add r0, r7 dec r6d jnz .loopH RET %endif %endmacro FILTER_VER_CHROMA_AVX2_64xN pp, 64 FILTER_VER_CHROMA_AVX2_64xN pp, 48 FILTER_VER_CHROMA_AVX2_64xN pp, 32 FILTER_VER_CHROMA_AVX2_64xN pp, 16 FILTER_VER_CHROMA_AVX2_64xN ps, 64 FILTER_VER_CHROMA_AVX2_64xN ps, 48 FILTER_VER_CHROMA_AVX2_64xN ps, 32 FILTER_VER_CHROMA_AVX2_64xN ps, 16 ;----------------------------------------------------------------------------- ; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;----------------------------------------------------------------------------- %macro FILTER_V4_W16n_H2 2 INIT_XMM sse4 cglobal interp_4tap_vert_pp_%1x%2, 4, 7, 8 mov r4d, r4m sub r0, r1 %ifdef PIC lea r5, [tab_ChromaCoeff] movd m0, [r5 + r4 * 4] %else movd m0, [tab_ChromaCoeff + r4 * 4] %endif pshufb m1, m0, [tab_Vm] pshufb m0, [tab_Vm + 16] mov r4d, %2/2 .loop: mov r6d, %1/16 .loopW: movu m2, [r0] movu m3, [r0 + r1] punpcklbw m4, m2, m3 punpckhbw m2, m3 pmaddubsw m4, m1 pmaddubsw m2, m1 lea r5, [r0 + 2 * r1] movu m5, [r5] movu m6, [r5 + r1] punpckhbw m7, m5, m6 pmaddubsw m7, m0 paddw m2, m7 punpcklbw m7, m5, m6 pmaddubsw m7, m0 paddw m4, m7 mova m7, [pw_512] pmulhrsw m4, m7 pmulhrsw m2, m7 
packuswb m4, m2 movu [r2], m4 punpcklbw m4, m3, m5 punpckhbw m3, m5 pmaddubsw m4, m1 pmaddubsw m3, m1 movu m5, [r5 + 2 * r1] punpcklbw m2, m6, m5 punpckhbw m6, m5 pmaddubsw m2, m0 pmaddubsw m6, m0 paddw m4, m2 paddw m3, m6 pmulhrsw m4, m7 pmulhrsw m3, m7 packuswb m4, m3 movu [r2 + r3], m4 add r0, 16 add r2, 16 dec r6d jnz .loopW lea r0, [r0 + r1 * 2 - %1] lea r2, [r2 + r3 * 2 - %1] dec r4d jnz .loop RET %endmacro FILTER_V4_W16n_H2 64, 64 FILTER_V4_W16n_H2 64, 32 FILTER_V4_W16n_H2 64, 48 FILTER_V4_W16n_H2 48, 64 FILTER_V4_W16n_H2 64, 16 ;----------------------------------------------------------------------------- ; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) ;----------------------------------------------------------------------------- %macro P2S_H_2xN 1 INIT_XMM sse4 cglobal filterPixelToShort_2x%1, 3, 4, 3 mov r3d, r3m add r3d, r3d ; load constant mova m1, [pb_128] mova m2, [tab_c_64_n64] %rep %1/2 movd m0, [r0] pinsrd m0, [r0 + r1], 1 punpcklbw m0, m1 pmaddubsw m0, m2 movd [r2 + r3 * 0], m0 pextrd [r2 + r3 * 1], m0, 2 lea r0, [r0 + r1 * 2] lea r2, [r2 + r3 * 2] %endrep RET %endmacro P2S_H_2xN 4 P2S_H_2xN 8 P2S_H_2xN 16 ;----------------------------------------------------------------------------- ; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) ;----------------------------------------------------------------------------- %macro P2S_H_4xN 1 INIT_XMM sse4 cglobal filterPixelToShort_4x%1, 3, 6, 4 mov r3d, r3m add r3d, r3d lea r4, [r3 * 3] lea r5, [r1 * 3] ; load constant mova m2, [pb_128] mova m3, [tab_c_64_n64] %assign x 0 %rep %1/4 movd m0, [r0] pinsrd m0, [r0 + r1], 1 punpcklbw m0, m2 pmaddubsw m0, m3 movd m1, [r0 + r1 * 2] pinsrd m1, [r0 + r5], 1 punpcklbw m1, m2 pmaddubsw m1, m3 movq [r2 + r3 * 0], m0 movq [r2 + r3 * 2], m1 movhps [r2 + r3 * 1], m0 movhps [r2 + r4], m1 %assign x x+1 %if (x != %1/4) lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] %endif %endrep RET %endmacro P2S_H_4xN 4 P2S_H_4xN 8 P2S_H_4xN 16 P2S_H_4xN 32 ;----------------------------------------------------------------------------- ; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) ;----------------------------------------------------------------------------- %macro P2S_H_6xN 1 INIT_XMM sse4 cglobal filterPixelToShort_6x%1, 3, 7, 6 mov r3d, r3m add r3d, r3d lea r4, [r1 * 3] lea r5, [r3 * 3] ; load height mov r6d, %1/4 ; load constant mova m4, [pb_128] mova m5, [tab_c_64_n64] .loop: movh m0, [r0] punpcklbw m0, m4 pmaddubsw m0, m5 movh m1, [r0 + r1] punpcklbw m1, m4 pmaddubsw m1, m5 movh m2, [r0 + r1 * 2] punpcklbw m2, m4 pmaddubsw m2, m5 movh m3, [r0 + r4] punpcklbw m3, m4 pmaddubsw m3, m5 movh [r2 + r3 * 0], m0 pextrd [r2 + r3 * 0 + 8], m0, 2 movh [r2 + r3 * 1], m1 pextrd [r2 + r3 * 1 + 8], m1, 2 movh [r2 + r3 * 2], m2 pextrd [r2 + r3 * 2 + 8], m2, 2 movh [r2 + r5], m3 pextrd [r2 + r5 + 8], m3, 2 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] dec r6d jnz .loop RET %endmacro P2S_H_6xN 8 P2S_H_6xN 16 ;----------------------------------------------------------------------------- ; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) ;----------------------------------------------------------------------------- %macro P2S_H_8xN 1 INIT_XMM ssse3 cglobal filterPixelToShort_8x%1, 3, 7, 6 mov r3d, r3m add r3d, r3d lea r5, [r1 * 3] lea r6, [r3 * 3] ; load height mov r4d, %1/4 ; load constant mova m4, [pb_128] mova m5, [tab_c_64_n64] .loop movh m0, [r0] punpcklbw m0, m4 
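; note (descriptive): p2s core - punpcklbw interleaves pixels with 128
; (pb_128), and the pmaddubsw below against tab_c_64_n64 (by its name,
; {64, -64} byte pairs) yields pix*64 - 128*64 = (pix << 6) - 8192, i.e. the
; same 16-bit intermediate format the ps filter variants produce.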
pmaddubsw m0, m5 movh m1, [r0 + r1] punpcklbw m1, m4 pmaddubsw m1, m5 movh m2, [r0 + r1 * 2] punpcklbw m2, m4 pmaddubsw m2, m5 movh m3, [r0 + r5] punpcklbw m3, m4 pmaddubsw m3, m5 movu [r2 + r3 * 0], m0 movu [r2 + r3 * 1], m1 movu [r2 + r3 * 2], m2 movu [r2 + r6 ], m3 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] dec r4d jnz .loop RET %endmacro P2S_H_8xN 8 P2S_H_8xN 4 P2S_H_8xN 16 P2S_H_8xN 32 P2S_H_8xN 12 P2S_H_8xN 64 ;----------------------------------------------------------------------------- ; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) ;----------------------------------------------------------------------------- INIT_XMM ssse3 cglobal filterPixelToShort_8x6, 3, 7, 5 mov r3d, r3m add r3d, r3d lea r4, [r1 * 3] lea r5, [r1 * 5] lea r6, [r3 * 3] ; load constant mova m3, [pb_128] mova m4, [tab_c_64_n64] movh m0, [r0] punpcklbw m0, m3 pmaddubsw m0, m4 movh m1, [r0 + r1] punpcklbw m1, m3 pmaddubsw m1, m4 movh m2, [r0 + r1 * 2] punpcklbw m2, m3 pmaddubsw m2, m4 movu [r2 + r3 * 0], m0 movu [r2 + r3 * 1], m1 movu [r2 + r3 * 2], m2 movh m0, [r0 + r4] punpcklbw m0, m3 pmaddubsw m0, m4 movh m1, [r0 + r1 * 4] punpcklbw m1, m3 pmaddubsw m1, m4 movh m2, [r0 + r5] punpcklbw m2, m3 pmaddubsw m2, m4 movu [r2 + r6 ], m0 movu [r2 + r3 * 4], m1 lea r2, [r2 + r3 * 4] movu [r2 + r3], m2 RET ;----------------------------------------------------------------------------- ; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) ;----------------------------------------------------------------------------- %macro P2S_H_16xN 1 INIT_XMM ssse3 cglobal filterPixelToShort_16x%1, 3, 7, 6 mov r3d, r3m add r3d, r3d lea r4, [r3 * 3] lea r5, [r1 * 3] ; load height mov r6d, %1/4 ; load constant mova m4, [pb_128] mova m5, [tab_c_64_n64] .loop: movh m0, [r0] punpcklbw m0, m4 pmaddubsw m0, m5 movh m1, [r0 + r1] punpcklbw m1, m4 pmaddubsw m1, m5 movh m2, [r0 + r1 * 2] punpcklbw m2, m4 pmaddubsw m2, m5 movh m3, [r0 + r5] punpcklbw m3, m4 pmaddubsw m3, m5 movu [r2 + r3 * 0], m0 movu [r2 + r3 * 1], m1 movu [r2 + r3 * 2], m2 movu [r2 + r4], m3 lea r0, [r0 + 8] movh m0, [r0] punpcklbw m0, m4 pmaddubsw m0, m5 movh m1, [r0 + r1] punpcklbw m1, m4 pmaddubsw m1, m5 movh m2, [r0 + r1 * 2] punpcklbw m2, m4 pmaddubsw m2, m5 movh m3, [r0 + r5] punpcklbw m3, m4 pmaddubsw m3, m5 movu [r2 + r3 * 0 + 16], m0 movu [r2 + r3 * 1 + 16], m1 movu [r2 + r3 * 2 + 16], m2 movu [r2 + r4 + 16], m3 lea r0, [r0 + r1 * 4 - 8] lea r2, [r2 + r3 * 4] dec r6d jnz .loop RET %endmacro P2S_H_16xN 16 P2S_H_16xN 4 P2S_H_16xN 8 P2S_H_16xN 12 P2S_H_16xN 32 P2S_H_16xN 64 P2S_H_16xN 24 ;----------------------------------------------------------------------------- ; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) ;----------------------------------------------------------------------------- INIT_YMM avx2 cglobal filterPixelToShort_16x4, 3, 4, 2 mov r3d, r3m add r3d, r3d ; load constant vbroadcasti128 m1, [pw_2000] pmovzxbw m0, [r0] psllw m0, 6 psubw m0, m1 movu [r2], m0 pmovzxbw m0, [r0 + r1] psllw m0, 6 psubw m0, m1 movu [r2 + r3], m0 pmovzxbw m0, [r0 + r1 * 2] psllw m0, 6 psubw m0, m1 movu [r2 + r3 * 2], m0 lea r1, [r1 * 3] lea r3, [r3 * 3] pmovzxbw m0, [r0 + r1] psllw m0, 6 psubw m0, m1 movu [r2 + r3], m0 RET ;----------------------------------------------------------------------------- ; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) 
;----------------------------------------------------------------------------- INIT_YMM avx2 cglobal filterPixelToShort_16x8, 3, 6, 2 mov r3d, r3m add r3d, r3d lea r4, [r1 * 3] lea r5, [r3 * 3] ; load constant vbroadcasti128 m1, [pw_2000] pmovzxbw m0, [r0] psllw m0, 6 psubw m0, m1 movu [r2], m0 pmovzxbw m0, [r0 + r1] psllw m0, 6 psubw m0, m1 movu [r2 + r3], m0 pmovzxbw m0, [r0 + r1 * 2] psllw m0, 6 psubw m0, m1 movu [r2 + r3 * 2], m0 pmovzxbw m0, [r0 + r4] psllw m0, 6 psubw m0, m1 movu [r2 + r5], m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] pmovzxbw m0, [r0] psllw m0, 6 psubw m0, m1 movu [r2], m0 pmovzxbw m0, [r0 + r1] psllw m0, 6 psubw m0, m1 movu [r2 + r3], m0 pmovzxbw m0, [r0 + r1 * 2] psllw m0, 6 psubw m0, m1 movu [r2 + r3 * 2], m0 pmovzxbw m0, [r0 + r4] psllw m0, 6 psubw m0, m1 movu [r2 + r5], m0 RET ;----------------------------------------------------------------------------- ; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) ;----------------------------------------------------------------------------- INIT_YMM avx2 cglobal filterPixelToShort_16x12, 3, 6, 2 mov r3d, r3m add r3d, r3d lea r4, [r1 * 3] lea r5, [r3 * 3] ; load constant vbroadcasti128 m1, [pw_2000] pmovzxbw m0, [r0] psllw m0, 6 psubw m0, m1 movu [r2], m0 pmovzxbw m0, [r0 + r1] psllw m0, 6 psubw m0, m1 movu [r2 + r3], m0 pmovzxbw m0, [r0 + r1 * 2] psllw m0, 6 psubw m0, m1 movu [r2 + r3 * 2], m0 pmovzxbw m0, [r0 + r4] psllw m0, 6 psubw m0, m1 movu [r2 + r5], m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] pmovzxbw m0, [r0] psllw m0, 6 psubw m0, m1 movu [r2], m0 pmovzxbw m0, [r0 + r1] psllw m0, 6 psubw m0, m1 movu [r2 + r3], m0 pmovzxbw m0, [r0 + r1 * 2] psllw m0, 6 psubw m0, m1 movu [r2 + r3 * 2], m0 pmovzxbw m0, [r0 + r4] psllw m0, 6 psubw m0, m1 movu [r2 + r5], m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] pmovzxbw m0, [r0] psllw m0, 6 psubw m0, m1 movu [r2], m0 pmovzxbw m0, [r0 + r1] psllw m0, 6 psubw m0, m1 movu [r2 + r3], m0 pmovzxbw m0, [r0 + r1 * 2] psllw m0, 6 psubw m0, m1 movu [r2 + r3 * 2], m0 pmovzxbw m0, [r0 + r4] psllw m0, 6 psubw m0, m1 movu [r2 + r5], m0 RET ;----------------------------------------------------------------------------- ; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) ;----------------------------------------------------------------------------- INIT_YMM avx2 cglobal filterPixelToShort_16x16, 3, 6, 2 mov r3d, r3m add r3d, r3d lea r4, [r1 * 3] lea r5, [r3 * 3] ; load constant vbroadcasti128 m1, [pw_2000] pmovzxbw m0, [r0] psllw m0, 6 psubw m0, m1 movu [r2], m0 pmovzxbw m0, [r0 + r1] psllw m0, 6 psubw m0, m1 movu [r2 + r3], m0 pmovzxbw m0, [r0 + r1 * 2] psllw m0, 6 psubw m0, m1 movu [r2 + r3 * 2], m0 pmovzxbw m0, [r0 + r4] psllw m0, 6 psubw m0, m1 movu [r2 + r5], m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] pmovzxbw m0, [r0] psllw m0, 6 psubw m0, m1 movu [r2], m0 pmovzxbw m0, [r0 + r1] psllw m0, 6 psubw m0, m1 movu [r2 + r3], m0 pmovzxbw m0, [r0 + r1 * 2] psllw m0, 6 psubw m0, m1 movu [r2 + r3 * 2], m0 pmovzxbw m0, [r0 + r4] psllw m0, 6 psubw m0, m1 movu [r2 + r5], m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] pmovzxbw m0, [r0] psllw m0, 6 psubw m0, m1 movu [r2], m0 pmovzxbw m0, [r0 + r1] psllw m0, 6 psubw m0, m1 movu [r2 + r3], m0 pmovzxbw m0, [r0 + r1 * 2] psllw m0, 6 psubw m0, m1 movu [r2 + r3 * 2], m0 pmovzxbw m0, [r0 + r4] psllw m0, 6 psubw m0, m1 movu [r2 + r5], m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] pmovzxbw m0, [r0] psllw m0, 6 psubw m0, m1 movu [r2], m0 pmovzxbw m0, [r0 + r1] psllw m0, 6 
psubw m0, m1 movu [r2 + r3], m0 pmovzxbw m0, [r0 + r1 * 2] psllw m0, 6 psubw m0, m1 movu [r2 + r3 * 2], m0 pmovzxbw m0, [r0 + r4] psllw m0, 6 psubw m0, m1 movu [r2 + r5], m0 RET ;----------------------------------------------------------------------------- ; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) ;----------------------------------------------------------------------------- INIT_YMM avx2 cglobal filterPixelToShort_16x24, 3, 7, 2 mov r3d, r3m add r3d, r3d lea r4, [r1 * 3] lea r5, [r3 * 3] mov r6d, 3 ; load constant vbroadcasti128 m1, [pw_2000] .loop: pmovzxbw m0, [r0] psllw m0, 6 psubw m0, m1 movu [r2], m0 pmovzxbw m0, [r0 + r1] psllw m0, 6 psubw m0, m1 movu [r2 + r3], m0 pmovzxbw m0, [r0 + r1 * 2] psllw m0, 6 psubw m0, m1 movu [r2 + r3 * 2], m0 pmovzxbw m0, [r0 + r4] psllw m0, 6 psubw m0, m1 movu [r2 + r5], m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] pmovzxbw m0, [r0] psllw m0, 6 psubw m0, m1 movu [r2], m0 pmovzxbw m0, [r0 + r1] psllw m0, 6 psubw m0, m1 movu [r2 + r3], m0 pmovzxbw m0, [r0 + r1 * 2] psllw m0, 6 psubw m0, m1 movu [r2 + r3 * 2], m0 pmovzxbw m0, [r0 + r4] psllw m0, 6 psubw m0, m1 movu [r2 + r5], m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] dec r6d jnz .loop RET ;----------------------------------------------------------------------------- ; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) ;----------------------------------------------------------------------------- %macro P2S_H_16xN_avx2 1 INIT_YMM avx2 cglobal filterPixelToShort_16x%1, 3, 7, 2 mov r3d, r3m add r3d, r3d lea r4, [r1 * 3] lea r5, [r3 * 3] mov r6d, %1/16 ; load constant vbroadcasti128 m1, [pw_2000] .loop: pmovzxbw m0, [r0] psllw m0, 6 psubw m0, m1 movu [r2], m0 pmovzxbw m0, [r0 + r1] psllw m0, 6 psubw m0, m1 movu [r2 + r3], m0 pmovzxbw m0, [r0 + r1 * 2] psllw m0, 6 psubw m0, m1 movu [r2 + r3 * 2], m0 pmovzxbw m0, [r0 + r4] psllw m0, 6 psubw m0, m1 movu [r2 + r5], m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] pmovzxbw m0, [r0] psllw m0, 6 psubw m0, m1 movu [r2], m0 pmovzxbw m0, [r0 + r1] psllw m0, 6 psubw m0, m1 movu [r2 + r3], m0 pmovzxbw m0, [r0 + r1 * 2] psllw m0, 6 psubw m0, m1 movu [r2 + r3 * 2], m0 pmovzxbw m0, [r0 + r4] psllw m0, 6 psubw m0, m1 movu [r2 + r5], m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] pmovzxbw m0, [r0] psllw m0, 6 psubw m0, m1 movu [r2], m0 pmovzxbw m0, [r0 + r1] psllw m0, 6 psubw m0, m1 movu [r2 + r3], m0 pmovzxbw m0, [r0 + r1 * 2] psllw m0, 6 psubw m0, m1 movu [r2 + r3 * 2], m0 pmovzxbw m0, [r0 + r4] psllw m0, 6 psubw m0, m1 movu [r2 + r5], m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] pmovzxbw m0, [r0] psllw m0, 6 psubw m0, m1 movu [r2], m0 pmovzxbw m0, [r0 + r1] psllw m0, 6 psubw m0, m1 movu [r2 + r3], m0 pmovzxbw m0, [r0 + r1 * 2] psllw m0, 6 psubw m0, m1 movu [r2 + r3 * 2], m0 pmovzxbw m0, [r0 + r4] psllw m0, 6 psubw m0, m1 movu [r2 + r5], m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] dec r6d jnz .loop RET %endmacro P2S_H_16xN_avx2 32 P2S_H_16xN_avx2 64 ;----------------------------------------------------------------------------- ; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) ;----------------------------------------------------------------------------- %macro P2S_H_32xN 1 INIT_XMM ssse3 cglobal filterPixelToShort_32x%1, 3, 7, 6 mov r3d, r3m add r3d, r3d lea r4, [r3 * 3] lea r5, [r1 * 3] ; load height mov r6d, %1/4 ; load constant mova m4, [pb_128] mova m5, [tab_c_64_n64] .loop: movh m0, [r0] punpcklbw m0, m4 pmaddubsw m0, m5 movh m1, [r0 
+ r1] punpcklbw m1, m4 pmaddubsw m1, m5 movh m2, [r0 + r1 * 2] punpcklbw m2, m4 pmaddubsw m2, m5 movh m3, [r0 + r5] punpcklbw m3, m4 pmaddubsw m3, m5 movu [r2 + r3 * 0], m0 movu [r2 + r3 * 1], m1 movu [r2 + r3 * 2], m2 movu [r2 + r4], m3 lea r0, [r0 + 8] movh m0, [r0] punpcklbw m0, m4 pmaddubsw m0, m5 movh m1, [r0 + r1] punpcklbw m1, m4 pmaddubsw m1, m5 movh m2, [r0 + r1 * 2] punpcklbw m2, m4 pmaddubsw m2, m5 movh m3, [r0 + r5] punpcklbw m3, m4 pmaddubsw m3, m5 movu [r2 + r3 * 0 + 16], m0 movu [r2 + r3 * 1 + 16], m1 movu [r2 + r3 * 2 + 16], m2 movu [r2 + r4 + 16], m3 lea r0, [r0 + 8] movh m0, [r0] punpcklbw m0, m4 pmaddubsw m0, m5 movh m1, [r0 + r1] punpcklbw m1, m4 pmaddubsw m1, m5 movh m2, [r0 + r1 * 2] punpcklbw m2, m4 pmaddubsw m2, m5 movh m3, [r0 + r5] punpcklbw m3, m4 pmaddubsw m3, m5 movu [r2 + r3 * 0 + 32], m0 movu [r2 + r3 * 1 + 32], m1 movu [r2 + r3 * 2 + 32], m2 movu [r2 + r4 + 32], m3 lea r0, [r0 + 8] movh m0, [r0] punpcklbw m0, m4 pmaddubsw m0, m5 movh m1, [r0 + r1] punpcklbw m1, m4 pmaddubsw m1, m5 movh m2, [r0 + r1 * 2] punpcklbw m2, m4 pmaddubsw m2, m5 movh m3, [r0 + r5] punpcklbw m3, m4 pmaddubsw m3, m5 movu [r2 + r3 * 0 + 48], m0 movu [r2 + r3 * 1 + 48], m1 movu [r2 + r3 * 2 + 48], m2 movu [r2 + r4 + 48], m3 lea r0, [r0 + r1 * 4 - 24] lea r2, [r2 + r3 * 4] dec r6d jnz .loop RET %endmacro P2S_H_32xN 32 P2S_H_32xN 8 P2S_H_32xN 16 P2S_H_32xN 24 P2S_H_32xN 64 P2S_H_32xN 48 ;----------------------------------------------------------------------------- ; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) ;----------------------------------------------------------------------------- %macro P2S_H_32xN_avx2 1 INIT_YMM avx2 cglobal filterPixelToShort_32x%1, 3, 7, 3 mov r3d, r3m add r3d, r3d lea r5, [r1 * 3] lea r6, [r3 * 3] ; load height mov r4d, %1/4 ; load constant vpbroadcastd m2, [pw_2000] .loop: pmovzxbw m0, [r0 + 0 * mmsize/2] pmovzxbw m1, [r0 + 1 * mmsize/2] psllw m0, 6 psllw m1, 6 psubw m0, m2 psubw m1, m2 movu [r2 + 0 * mmsize], m0 movu [r2 + 1 * mmsize], m1 pmovzxbw m0, [r0 + r1 + 0 * mmsize/2] pmovzxbw m1, [r0 + r1 + 1 * mmsize/2] psllw m0, 6 psllw m1, 6 psubw m0, m2 psubw m1, m2 movu [r2 + r3 + 0 * mmsize], m0 movu [r2 + r3 + 1 * mmsize], m1 pmovzxbw m0, [r0 + r1 * 2 + 0 * mmsize/2] pmovzxbw m1, [r0 + r1 * 2 + 1 * mmsize/2] psllw m0, 6 psllw m1, 6 psubw m0, m2 psubw m1, m2 movu [r2 + r3 * 2 + 0 * mmsize], m0 movu [r2 + r3 * 2 + 1 * mmsize], m1 pmovzxbw m0, [r0 + r5 + 0 * mmsize/2] pmovzxbw m1, [r0 + r5 + 1 * mmsize/2] psllw m0, 6 psllw m1, 6 psubw m0, m2 psubw m1, m2 movu [r2 + r6 + 0 * mmsize], m0 movu [r2 + r6 + 1 * mmsize], m1 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] dec r4d jnz .loop RET %endmacro P2S_H_32xN_avx2 32 P2S_H_32xN_avx2 8 P2S_H_32xN_avx2 16 P2S_H_32xN_avx2 24 P2S_H_32xN_avx2 64 P2S_H_32xN_avx2 48 ;----------------------------------------------------------------------------- ; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) ;----------------------------------------------------------------------------- %macro P2S_H_64xN 1 INIT_XMM ssse3 cglobal filterPixelToShort_64x%1, 3, 7, 6 mov r3d, r3m add r3d, r3d lea r4, [r3 * 3] lea r5, [r1 * 3] ; load height mov r6d, %1/4 ; load constant mova m4, [pb_128] mova m5, [tab_c_64_n64] .loop: movh m0, [r0] punpcklbw m0, m4 pmaddubsw m0, m5 movh m1, [r0 + r1] punpcklbw m1, m4 pmaddubsw m1, m5 movh m2, [r0 + r1 * 2] punpcklbw m2, m4 pmaddubsw m2, m5 movh m3, [r0 + r5] punpcklbw m3, m4 pmaddubsw m3, m5 movu [r2 + r3 * 0], m0 movu [r2 + r3 * 
1], m1 movu [r2 + r3 * 2], m2 movu [r2 + r4], m3 lea r0, [r0 + 8] movh m0, [r0] punpcklbw m0, m4 pmaddubsw m0, m5 movh m1, [r0 + r1] punpcklbw m1, m4 pmaddubsw m1, m5 movh m2, [r0 + r1 * 2] punpcklbw m2, m4 pmaddubsw m2, m5 movh m3, [r0 + r5] punpcklbw m3, m4 pmaddubsw m3, m5 movu [r2 + r3 * 0 + 16], m0 movu [r2 + r3 * 1 + 16], m1 movu [r2 + r3 * 2 + 16], m2 movu [r2 + r4 + 16], m3 lea r0, [r0 + 8] movh m0, [r0] punpcklbw m0, m4 pmaddubsw m0, m5 movh m1, [r0 + r1] punpcklbw m1, m4 pmaddubsw m1, m5 movh m2, [r0 + r1 * 2] punpcklbw m2, m4 pmaddubsw m2, m5 movh m3, [r0 + r5] punpcklbw m3, m4 pmaddubsw m3, m5 movu [r2 + r3 * 0 + 32], m0 movu [r2 + r3 * 1 + 32], m1 movu [r2 + r3 * 2 + 32], m2 movu [r2 + r4 + 32], m3 lea r0, [r0 + 8] movh m0, [r0] punpcklbw m0, m4 pmaddubsw m0, m5 movh m1, [r0 + r1] punpcklbw m1, m4 pmaddubsw m1, m5 movh m2, [r0 + r1 * 2] punpcklbw m2, m4 pmaddubsw m2, m5 movh m3, [r0 + r5] punpcklbw m3, m4 pmaddubsw m3, m5 movu [r2 + r3 * 0 + 48], m0 movu [r2 + r3 * 1 + 48], m1 movu [r2 + r3 * 2 + 48], m2 movu [r2 + r4 + 48], m3 lea r0, [r0 + 8] movh m0, [r0] punpcklbw m0, m4 pmaddubsw m0, m5 movh m1, [r0 + r1] punpcklbw m1, m4 pmaddubsw m1, m5 movh m2, [r0 + r1 * 2] punpcklbw m2, m4 pmaddubsw m2, m5 movh m3, [r0 + r5] punpcklbw m3, m4 pmaddubsw m3, m5 movu [r2 + r3 * 0 + 64], m0 movu [r2 + r3 * 1 + 64], m1 movu [r2 + r3 * 2 + 64], m2 movu [r2 + r4 + 64], m3 lea r0, [r0 + 8] movh m0, [r0] punpcklbw m0, m4 pmaddubsw m0, m5 movh m1, [r0 + r1] punpcklbw m1, m4 pmaddubsw m1, m5 movh m2, [r0 + r1 * 2] punpcklbw m2, m4 pmaddubsw m2, m5 movh m3, [r0 + r5] punpcklbw m3, m4 pmaddubsw m3, m5 movu [r2 + r3 * 0 + 80], m0 movu [r2 + r3 * 1 + 80], m1 movu [r2 + r3 * 2 + 80], m2 movu [r2 + r4 + 80], m3 lea r0, [r0 + 8] movh m0, [r0] punpcklbw m0, m4 pmaddubsw m0, m5 movh m1, [r0 + r1] punpcklbw m1, m4 pmaddubsw m1, m5 movh m2, [r0 + r1 * 2] punpcklbw m2, m4 pmaddubsw m2, m5 movh m3, [r0 + r5] punpcklbw m3, m4 pmaddubsw m3, m5 movu [r2 + r3 * 0 + 96], m0 movu [r2 + r3 * 1 + 96], m1 movu [r2 + r3 * 2 + 96], m2 movu [r2 + r4 + 96], m3 lea r0, [r0 + 8] movh m0, [r0] punpcklbw m0, m4 pmaddubsw m0, m5 movh m1, [r0 + r1] punpcklbw m1, m4 pmaddubsw m1, m5 movh m2, [r0 + r1 * 2] punpcklbw m2, m4 pmaddubsw m2, m5 movh m3, [r0 + r5] punpcklbw m3, m4 pmaddubsw m3, m5 movu [r2 + r3 * 0 + 112], m0 movu [r2 + r3 * 1 + 112], m1 movu [r2 + r3 * 2 + 112], m2 movu [r2 + r4 + 112], m3 lea r0, [r0 + r1 * 4 - 56] lea r2, [r2 + r3 * 4] dec r6d jnz .loop RET %endmacro P2S_H_64xN 64 P2S_H_64xN 16 P2S_H_64xN 32 P2S_H_64xN 48 ;----------------------------------------------------------------------------- ; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) ;----------------------------------------------------------------------------- %macro P2S_H_64xN_avx2 1 INIT_YMM avx2 cglobal filterPixelToShort_64x%1, 3, 7, 5 mov r3d, r3m add r3d, r3d lea r5, [r1 * 3] lea r6, [r3 * 3] ; load height mov r4d, %1/4 ; load constant vpbroadcastd m4, [pw_2000] .loop: pmovzxbw m0, [r0 + 0 * mmsize/2] pmovzxbw m1, [r0 + 1 * mmsize/2] pmovzxbw m2, [r0 + 2 * mmsize/2] pmovzxbw m3, [r0 + 3 * mmsize/2] psllw m0, 6 psllw m1, 6 psllw m2, 6 psllw m3, 6 psubw m0, m4 psubw m1, m4 psubw m2, m4 psubw m3, m4 movu [r2 + 0 * mmsize], m0 movu [r2 + 1 * mmsize], m1 movu [r2 + 2 * mmsize], m2 movu [r2 + 3 * mmsize], m3 pmovzxbw m0, [r0 + r1 + 0 * mmsize/2] pmovzxbw m1, [r0 + r1 + 1 * mmsize/2] pmovzxbw m2, [r0 + r1 + 2 * mmsize/2] pmovzxbw m3, [r0 + r1 + 3 * mmsize/2] psllw m0, 6 psllw m1, 6 psllw m2, 6 psllw m3, 6 
    psubw      m0, m4
    psubw      m1, m4
    psubw      m2, m4
    psubw      m3, m4
    movu       [r2 + r3 + 0 * mmsize], m0
    movu       [r2 + r3 + 1 * mmsize], m1
    movu       [r2 + r3 + 2 * mmsize], m2
    movu       [r2 + r3 + 3 * mmsize], m3

    pmovzxbw   m0, [r0 + r1 * 2 + 0 * mmsize/2]
    pmovzxbw   m1, [r0 + r1 * 2 + 1 * mmsize/2]
    pmovzxbw   m2, [r0 + r1 * 2 + 2 * mmsize/2]
    pmovzxbw   m3, [r0 + r1 * 2 + 3 * mmsize/2]
    psllw      m0, 6
    psllw      m1, 6
    psllw      m2, 6
    psllw      m3, 6
    psubw      m0, m4
    psubw      m1, m4
    psubw      m2, m4
    psubw      m3, m4
    movu       [r2 + r3 * 2 + 0 * mmsize], m0
    movu       [r2 + r3 * 2 + 1 * mmsize], m1
    movu       [r2 + r3 * 2 + 2 * mmsize], m2
    movu       [r2 + r3 * 2 + 3 * mmsize], m3

    pmovzxbw   m0, [r0 + r5 + 0 * mmsize/2]
    pmovzxbw   m1, [r0 + r5 + 1 * mmsize/2]
    pmovzxbw   m2, [r0 + r5 + 2 * mmsize/2]
    pmovzxbw   m3, [r0 + r5 + 3 * mmsize/2]
    psllw      m0, 6
    psllw      m1, 6
    psllw      m2, 6
    psllw      m3, 6
    psubw      m0, m4
    psubw      m1, m4
    psubw      m2, m4
    psubw      m3, m4
    movu       [r2 + r6 + 0 * mmsize], m0
    movu       [r2 + r6 + 1 * mmsize], m1
    movu       [r2 + r6 + 2 * mmsize], m2
    movu       [r2 + r6 + 3 * mmsize], m3

    lea        r0, [r0 + r1 * 4]
    lea        r2, [r2 + r3 * 4]

    dec        r4d
    jnz        .loop
    RET
%endmacro
P2S_H_64xN_avx2 64
P2S_H_64xN_avx2 16
P2S_H_64xN_avx2 32
P2S_H_64xN_avx2 48

;-----------------------------------------------------------------------------
; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
;-----------------------------------------------------------------------------
%macro P2S_H_12xN 1
INIT_XMM ssse3
cglobal filterPixelToShort_12x%1, 3, 7, 6
    mov        r3d, r3m
    add        r3d, r3d
    lea        r4, [r1 * 3]
    lea        r6, [r3 * 3]
    mov        r5d, %1/4

    ; load constant
    mova       m4, [pb_128]
    mova       m5, [tab_c_64_n64]

.loop:
    movu       m0, [r0]
    punpcklbw  m1, m0, m4
    punpckhbw  m0, m4
    pmaddubsw  m0, m5
    pmaddubsw  m1, m5

    movu       m2, [r0 + r1]
    punpcklbw  m3, m2, m4
    punpckhbw  m2, m4
    pmaddubsw  m2, m5
    pmaddubsw  m3, m5

    movu       [r2 + r3 * 0], m1
    movu       [r2 + r3 * 1], m3
    movh       [r2 + r3 * 0 + 16], m0
    movh       [r2 + r3 * 1 + 16], m2

    movu       m0, [r0 + r1 * 2]
    punpcklbw  m1, m0, m4
    punpckhbw  m0, m4
    pmaddubsw  m0, m5
    pmaddubsw  m1, m5

    movu       m2, [r0 + r4]
    punpcklbw  m3, m2, m4
    punpckhbw  m2, m4
    pmaddubsw  m2, m5
    pmaddubsw  m3, m5

    movu       [r2 + r3 * 2], m1
    movu       [r2 + r6], m3
    movh       [r2 + r3 * 2 + 16], m0
    movh       [r2 + r6 + 16], m2

    lea        r0, [r0 + r1 * 4]
    lea        r2, [r2 + r3 * 4]

    dec        r5d
    jnz        .loop
    RET
%endmacro
P2S_H_12xN 16
P2S_H_12xN 32

;-----------------------------------------------------------------------------
; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
;-----------------------------------------------------------------------------
%macro P2S_H_24xN 1
INIT_XMM ssse3
cglobal filterPixelToShort_24x%1, 3, 7, 5
    mov        r3d, r3m
    add        r3d, r3d
    lea        r4, [r1 * 3]
    lea        r5, [r3 * 3]
    mov        r6d, %1/4

    ; load constant
    mova       m3, [pb_128]
    mova       m4, [tab_c_64_n64]

.loop:
    movu       m0, [r0]
    punpcklbw  m1, m0, m3
    punpckhbw  m0, m3
    pmaddubsw  m0, m4
    pmaddubsw  m1, m4

    movu       m2, [r0 + 16]
    punpcklbw  m2, m3
    pmaddubsw  m2, m4

    movu       [r2 + r3 * 0], m1
    movu       [r2 + r3 * 0 + 16], m0
    movu       [r2 + r3 * 0 + 32], m2

    movu       m0, [r0 + r1]
    punpcklbw  m1, m0, m3
    punpckhbw  m0, m3
    pmaddubsw  m0, m4
    pmaddubsw  m1, m4

    movu       m2, [r0 + r1 + 16]
    punpcklbw  m2, m3
    pmaddubsw  m2, m4

    movu       [r2 + r3 * 1], m1
    movu       [r2 + r3 * 1 + 16], m0
    movu       [r2 + r3 * 1 + 32], m2

    movu       m0, [r0 + r1 * 2]
    punpcklbw  m1, m0, m3
    punpckhbw  m0, m3
    pmaddubsw  m0, m4
    pmaddubsw  m1, m4

    movu       m2, [r0 + r1 * 2 + 16]
    punpcklbw  m2, m3
    pmaddubsw  m2, m4

    movu       [r2 + r3 * 2], m1
    movu       [r2 + r3 * 2 + 16], m0
    movu       [r2 + r3 * 2 + 32], m2

    movu       m0, [r0 + r4]
    punpcklbw  m1, m0, m3
    punpckhbw  m0, m3
    pmaddubsw  m0, m4
    pmaddubsw  m1, m4

    movu       m2, [r0 + r4 + 16]
    punpcklbw  m2, m3
    pmaddubsw  m2, m4

    movu       [r2
+ r5], m1 movu [r2 + r5 + 16], m0 movu [r2 + r5 + 32], m2 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] dec r6d jnz .loop RET %endmacro P2S_H_24xN 32 P2S_H_24xN 64 ;----------------------------------------------------------------------------- ; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) ;----------------------------------------------------------------------------- %macro P2S_H_24xN_avx2 1 INIT_YMM avx2 cglobal filterPixelToShort_24x%1, 3, 7, 4 mov r3d, r3m add r3d, r3d lea r4, [r1 * 3] lea r5, [r3 * 3] mov r6d, %1/4 ; load constant vpbroadcastd m1, [pw_2000] vpbroadcastd m2, [pb_128] vpbroadcastd m3, [tab_c_64_n64] .loop: pmovzxbw m0, [r0] psllw m0, 6 psubw m0, m1 movu [r2], m0 movu m0, [r0 + mmsize/2] punpcklbw m0, m2 pmaddubsw m0, m3 movu [r2 + r3 * 0 + mmsize], xm0 pmovzxbw m0, [r0 + r1] psllw m0, 6 psubw m0, m1 movu [r2 + r3], m0 movu m0, [r0 + r1 + mmsize/2] punpcklbw m0, m2 pmaddubsw m0, m3 movu [r2 + r3 * 1 + mmsize], xm0 pmovzxbw m0, [r0 + r1 * 2] psllw m0, 6 psubw m0, m1 movu [r2 + r3 * 2], m0 movu m0, [r0 + r1 * 2 + mmsize/2] punpcklbw m0, m2 pmaddubsw m0, m3 movu [r2 + r3 * 2 + mmsize], xm0 pmovzxbw m0, [r0 + r4] psllw m0, 6 psubw m0, m1 movu [r2 + r5], m0 movu m0, [r0 + r4 + mmsize/2] punpcklbw m0, m2 pmaddubsw m0, m3 movu [r2 + r5 + mmsize], xm0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] dec r6d jnz .loop RET %endmacro P2S_H_24xN_avx2 32 P2S_H_24xN_avx2 64 ;----------------------------------------------------------------------------- ; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) ;----------------------------------------------------------------------------- INIT_XMM ssse3 cglobal filterPixelToShort_48x64, 3, 7, 4 mov r3d, r3m add r3d, r3d lea r4, [r1 * 3] lea r5, [r3 * 3] mov r6d, 16 ; load constant mova m2, [pb_128] mova m3, [tab_c_64_n64] .loop: movu m0, [r0] punpcklbw m1, m0, m2 punpckhbw m0, m2 pmaddubsw m0, m3 pmaddubsw m1, m3 movu [r2 + r3 * 0], m1 movu [r2 + r3 * 0 + 16], m0 movu m0, [r0 + 16] punpcklbw m1, m0, m2 punpckhbw m0, m2 pmaddubsw m0, m3 pmaddubsw m1, m3 movu [r2 + r3 * 0 + 32], m1 movu [r2 + r3 * 0 + 48], m0 movu m0, [r0 + 32] punpcklbw m1, m0, m2 punpckhbw m0, m2 pmaddubsw m0, m3 pmaddubsw m1, m3 movu [r2 + r3 * 0 + 64], m1 movu [r2 + r3 * 0 + 80], m0 movu m0, [r0 + r1] punpcklbw m1, m0, m2 punpckhbw m0, m2 pmaddubsw m0, m3 pmaddubsw m1, m3 movu [r2 + r3 * 1], m1 movu [r2 + r3 * 1 + 16], m0 movu m0, [r0 + r1 + 16] punpcklbw m1, m0, m2 punpckhbw m0, m2 pmaddubsw m0, m3 pmaddubsw m1, m3 movu [r2 + r3 * 1 + 32], m1 movu [r2 + r3 * 1 + 48], m0 movu m0, [r0 + r1 + 32] punpcklbw m1, m0, m2 punpckhbw m0, m2 pmaddubsw m0, m3 pmaddubsw m1, m3 movu [r2 + r3 * 1 + 64], m1 movu [r2 + r3 * 1 + 80], m0 movu m0, [r0 + r1 * 2] punpcklbw m1, m0, m2 punpckhbw m0, m2 pmaddubsw m0, m3 pmaddubsw m1, m3 movu [r2 + r3 * 2], m1 movu [r2 + r3 * 2 + 16], m0 movu m0, [r0 + r1 * 2 + 16] punpcklbw m1, m0, m2 punpckhbw m0, m2 pmaddubsw m0, m3 pmaddubsw m1, m3 movu [r2 + r3 * 2 + 32], m1 movu [r2 + r3 * 2 + 48], m0 movu m0, [r0 + r1 * 2 + 32] punpcklbw m1, m0, m2 punpckhbw m0, m2 pmaddubsw m0, m3 pmaddubsw m1, m3 movu [r2 + r3 * 2 + 64], m1 movu [r2 + r3 * 2 + 80], m0 movu m0, [r0 + r4] punpcklbw m1, m0, m2 punpckhbw m0, m2 pmaddubsw m0, m3 pmaddubsw m1, m3 movu [r2 + r5], m1 movu [r2 + r5 + 16], m0 movu m0, [r0 + r4 + 16] punpcklbw m1, m0, m2 punpckhbw m0, m2 pmaddubsw m0, m3 pmaddubsw m1, m3 movu [r2 + r5 + 32], m1 movu [r2 + r5 + 48], m0 movu m0, [r0 + r4 + 32] punpcklbw m1, m0, m2 punpckhbw m0, m2 
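    ; interleaving each source byte with pb_128 lets the pmaddubsw below
    ; compute px*64 + 128*(-64) = (px << 6) - 8192 per output word, the same
    ; p2s mapping described above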
pmaddubsw m0, m3 pmaddubsw m1, m3 movu [r2 + r5 + 64], m1 movu [r2 + r5 + 80], m0 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] dec r6d jnz .loop RET ;----------------------------------------------------------------------------- ; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) ;----------------------------------------------------------------------------- INIT_YMM avx2 cglobal filterPixelToShort_48x64, 3,7,4 mov r3d, r3m add r3d, r3d lea r5, [r1 * 3] lea r6, [r3 * 3] ; load height mov r4d, 64/4 ; load constant vpbroadcastd m3, [pw_2000] ; just unroll(1) because it is best choice for 48x64 .loop: pmovzxbw m0, [r0 + 0 * mmsize/2] pmovzxbw m1, [r0 + 1 * mmsize/2] pmovzxbw m2, [r0 + 2 * mmsize/2] psllw m0, 6 psllw m1, 6 psllw m2, 6 psubw m0, m3 psubw m1, m3 psubw m2, m3 movu [r2 + 0 * mmsize], m0 movu [r2 + 1 * mmsize], m1 movu [r2 + 2 * mmsize], m2 pmovzxbw m0, [r0 + r1 + 0 * mmsize/2] pmovzxbw m1, [r0 + r1 + 1 * mmsize/2] pmovzxbw m2, [r0 + r1 + 2 * mmsize/2] psllw m0, 6 psllw m1, 6 psllw m2, 6 psubw m0, m3 psubw m1, m3 psubw m2, m3 movu [r2 + r3 + 0 * mmsize], m0 movu [r2 + r3 + 1 * mmsize], m1 movu [r2 + r3 + 2 * mmsize], m2 pmovzxbw m0, [r0 + r1 * 2 + 0 * mmsize/2] pmovzxbw m1, [r0 + r1 * 2 + 1 * mmsize/2] pmovzxbw m2, [r0 + r1 * 2 + 2 * mmsize/2] psllw m0, 6 psllw m1, 6 psllw m2, 6 psubw m0, m3 psubw m1, m3 psubw m2, m3 movu [r2 + r3 * 2 + 0 * mmsize], m0 movu [r2 + r3 * 2 + 1 * mmsize], m1 movu [r2 + r3 * 2 + 2 * mmsize], m2 pmovzxbw m0, [r0 + r5 + 0 * mmsize/2] pmovzxbw m1, [r0 + r5 + 1 * mmsize/2] pmovzxbw m2, [r0 + r5 + 2 * mmsize/2] psllw m0, 6 psllw m1, 6 psllw m2, 6 psubw m0, m3 psubw m1, m3 psubw m2, m3 movu [r2 + r6 + 0 * mmsize], m0 movu [r2 + r6 + 1 * mmsize], m1 movu [r2 + r6 + 2 * mmsize], m2 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] dec r4d jnz .loop RET %macro PROCESS_LUMA_W4_4R 0 movd m0, [r0] movd m1, [r0 + r1] punpcklbw m2, m0, m1 ; m2=[0 1] lea r0, [r0 + 2 * r1] movd m0, [r0] punpcklbw m1, m0 ; m1=[1 2] punpcklqdq m2, m1 ; m2=[0 1 1 2] pmaddubsw m4, m2, [r6 + 0 * 16] ; m4=[0+1 1+2] movd m1, [r0 + r1] punpcklbw m5, m0, m1 ; m2=[2 3] lea r0, [r0 + 2 * r1] movd m0, [r0] punpcklbw m1, m0 ; m1=[3 4] punpcklqdq m5, m1 ; m5=[2 3 3 4] pmaddubsw m2, m5, [r6 + 1 * 16] ; m2=[2+3 3+4] paddw m4, m2 ; m4=[0+1+2+3 1+2+3+4] Row1-2 pmaddubsw m5, [r6 + 0 * 16] ; m5=[2+3 3+4] Row3-4 movd m1, [r0 + r1] punpcklbw m2, m0, m1 ; m2=[4 5] lea r0, [r0 + 2 * r1] movd m0, [r0] punpcklbw m1, m0 ; m1=[5 6] punpcklqdq m2, m1 ; m2=[4 5 5 6] pmaddubsw m1, m2, [r6 + 2 * 16] ; m1=[4+5 5+6] paddw m4, m1 ; m4=[0+1+2+3+4+5 1+2+3+4+5+6] Row1-2 pmaddubsw m2, [r6 + 1 * 16] ; m2=[4+5 5+6] paddw m5, m2 ; m5=[2+3+4+5 3+4+5+6] Row3-4 movd m1, [r0 + r1] punpcklbw m2, m0, m1 ; m2=[6 7] lea r0, [r0 + 2 * r1] movd m0, [r0] punpcklbw m1, m0 ; m1=[7 8] punpcklqdq m2, m1 ; m2=[6 7 7 8] pmaddubsw m1, m2, [r6 + 3 * 16] ; m1=[6+7 7+8] paddw m4, m1 ; m4=[0+1+2+3+4+5+6+7 1+2+3+4+5+6+7+8] Row1-2 end pmaddubsw m2, [r6 + 2 * 16] ; m2=[6+7 7+8] paddw m5, m2 ; m5=[2+3+4+5+6+7 3+4+5+6+7+8] Row3-4 movd m1, [r0 + r1] punpcklbw m2, m0, m1 ; m2=[8 9] movd m0, [r0 + 2 * r1] punpcklbw m1, m0 ; m1=[9 10] punpcklqdq m2, m1 ; m2=[8 9 9 10] pmaddubsw m2, [r6 + 3 * 16] ; m2=[8+9 9+10] paddw m5, m2 ; m5=[2+3+4+5+6+7+8+9 3+4+5+6+7+8+9+10] Row3-4 end %endmacro %macro PROCESS_LUMA_W8_4R 0 movq m0, [r0] movq m1, [r0 + r1] punpcklbw m0, m1 pmaddubsw m7, m0, [r6 + 0 *16] ;m7=[0+1] Row1 lea r0, [r0 + 2 * r1] movq m0, [r0] punpcklbw m1, m0 pmaddubsw m6, m1, [r6 + 0 *16] ;m6=[1+2] Row2 movq m1, [r0 + r1] 
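    ; PROCESS_LUMA_W8_4R slides an 8-tap window down the column: each newly
    ; loaded row is punpcklbw-paired with the previous one, so one pmaddubsw
    ; applies two filter taps, and the four row accumulators m7/m6/m5/m4
    ; finish in staggered order (output row n consumes source rows n..n+7)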
punpcklbw m0, m1 pmaddubsw m5, m0, [r6 + 0 *16] ;m5=[2+3] Row3 pmaddubsw m0, [r6 + 1 * 16] paddw m7, m0 ;m7=[0+1+2+3] Row1 lea r0, [r0 + 2 * r1] movq m0, [r0] punpcklbw m1, m0 pmaddubsw m4, m1, [r6 + 0 *16] ;m4=[3+4] Row4 pmaddubsw m1, [r6 + 1 * 16] paddw m6, m1 ;m6 = [1+2+3+4] Row2 movq m1, [r0 + r1] punpcklbw m0, m1 pmaddubsw m2, m0, [r6 + 1 * 16] pmaddubsw m0, [r6 + 2 * 16] paddw m7, m0 ;m7=[0+1+2+3+4+5] Row1 paddw m5, m2 ;m5=[2+3+4+5] Row3 lea r0, [r0 + 2 * r1] movq m0, [r0] punpcklbw m1, m0 pmaddubsw m2, m1, [r6 + 1 * 16] pmaddubsw m1, [r6 + 2 * 16] paddw m6, m1 ;m6=[1+2+3+4+5+6] Row2 paddw m4, m2 ;m4=[3+4+5+6] Row4 movq m1, [r0 + r1] punpcklbw m0, m1 pmaddubsw m2, m0, [r6 + 2 * 16] pmaddubsw m0, [r6 + 3 * 16] paddw m7, m0 ;m7=[0+1+2+3+4+5+6+7] Row1 end paddw m5, m2 ;m5=[2+3+4+5+6+7] Row3 lea r0, [r0 + 2 * r1] movq m0, [r0] punpcklbw m1, m0 pmaddubsw m2, m1, [r6 + 2 * 16] pmaddubsw m1, [r6 + 3 * 16] paddw m6, m1 ;m6=[1+2+3+4+5+6+7+8] Row2 end paddw m4, m2 ;m4=[3+4+5+6+7+8] Row4 movq m1, [r0 + r1] punpcklbw m0, m1 pmaddubsw m0, [r6 + 3 * 16] paddw m5, m0 ;m5=[2+3+4+5+6+7+8+9] Row3 end movq m0, [r0 + 2 * r1] punpcklbw m1, m0 pmaddubsw m1, [r6 + 3 * 16] paddw m4, m1 ;m4=[3+4+5+6+7+8+9+10] Row4 end %endmacro ;------------------------------------------------------------------------------------------------------------- ; void interp_8tap_vert_%3_4x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;------------------------------------------------------------------------------------------------------------- %macro FILTER_VER_LUMA_4xN 3 INIT_XMM sse4 cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 6 lea r5, [3 * r1] sub r0, r5 shl r4d, 6 %ifidn %3,ps add r3d, r3d %endif %ifdef PIC lea r5, [tab_LumaCoeffVer] lea r6, [r5 + r4] %else lea r6, [tab_LumaCoeffVer + r4] %endif %ifidn %3,pp mova m3, [pw_512] %else mova m3, [pw_2000] %endif mov r4d, %2/4 lea r5, [4 * r1] .loopH: PROCESS_LUMA_W4_4R %ifidn %3,pp pmulhrsw m4, m3 pmulhrsw m5, m3 packuswb m4, m5 movd [r2], m4 pextrd [r2 + r3], m4, 1 lea r2, [r2 + 2 * r3] pextrd [r2], m4, 2 pextrd [r2 + r3], m4, 3 %else psubw m4, m3 psubw m5, m3 movlps [r2], m4 movhps [r2 + r3], m4 lea r2, [r2 + 2 * r3] movlps [r2], m5 movhps [r2 + r3], m5 %endif sub r0, r5 lea r2, [r2 + 2 * r3] dec r4d jnz .loopH RET %endmacro INIT_YMM avx2 cglobal interp_8tap_vert_pp_4x4, 4,6,8 mov r4d, r4m lea r5, [r1 * 3] sub r0, r5 ; TODO: VPGATHERDD movd xm1, [r0] ; m1 = row0 movd xm2, [r0 + r1] ; m2 = row1 punpcklbw xm1, xm2 ; m1 = [13 03 12 02 11 01 10 00] movd xm3, [r0 + r1 * 2] ; m3 = row2 punpcklbw xm2, xm3 ; m2 = [23 13 22 12 21 11 20 10] movd xm4, [r0 + r5] punpcklbw xm3, xm4 ; m3 = [33 23 32 22 31 21 30 20] punpcklwd xm1, xm3 ; m1 = [33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00] lea r0, [r0 + r1 * 4] movd xm5, [r0] ; m5 = row4 punpcklbw xm4, xm5 ; m4 = [43 33 42 32 41 31 40 30] punpcklwd xm2, xm4 ; m2 = [43 33 21 13 42 32 22 12 41 31 21 11 40 30 20 10] vinserti128 m1, m1, xm2, 1 ; m1 = [43 33 21 13 42 32 22 12 41 31 21 11 40 30 20 10] - [33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00] movd xm2, [r0 + r1] ; m2 = row5 punpcklbw xm5, xm2 ; m5 = [53 43 52 42 51 41 50 40] punpcklwd xm3, xm5 ; m3 = [53 43 44 23 52 42 32 22 51 41 31 21 50 40 30 20] movd xm6, [r0 + r1 * 2] ; m6 = row6 punpcklbw xm2, xm6 ; m2 = [63 53 62 52 61 51 60 50] punpcklwd xm4, xm2 ; m4 = [63 53 43 33 62 52 42 32 61 51 41 31 60 50 40 30] vinserti128 m3, m3, xm4, 1 ; m3 = [63 53 43 33 62 52 42 32 61 51 41 31 60 50 40 30] - [53 43 44 23 52 42 32 22 51 41 31 21 50 40 30 20] movd xm4, [r0 + 
r5] ; m4 = row7 punpcklbw xm6, xm4 ; m6 = [73 63 72 62 71 61 70 60] punpcklwd xm5, xm6 ; m5 = [73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40] lea r0, [r0 + r1 * 4] movd xm7, [r0] ; m7 = row8 punpcklbw xm4, xm7 ; m4 = [83 73 82 72 81 71 80 70] punpcklwd xm2, xm4 ; m2 = [83 73 63 53 82 72 62 52 81 71 61 51 80 70 60 50] vinserti128 m5, m5, xm2, 1 ; m5 = [83 73 63 53 82 72 62 52 81 71 61 51 80 70 60 50] - [73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40] movd xm2, [r0 + r1] ; m2 = row9 punpcklbw xm7, xm2 ; m7 = [93 83 92 82 91 81 90 80] punpcklwd xm6, xm7 ; m6 = [93 83 73 63 92 82 72 62 91 81 71 61 90 80 70 60] movd xm7, [r0 + r1 * 2] ; m7 = rowA punpcklbw xm2, xm7 ; m2 = [A3 93 A2 92 A1 91 A0 90] punpcklwd xm4, xm2 ; m4 = [A3 93 83 73 A2 92 82 72 A1 91 81 71 A0 90 80 70] vinserti128 m6, m6, xm4, 1 ; m6 = [A3 93 83 73 A2 92 82 72 A1 91 81 71 A0 90 80 70] - [93 83 73 63 92 82 72 62 91 81 71 61 90 80 70 60] ; load filter coeff %ifdef PIC lea r5, [tab_LumaCoeff] vpbroadcastd m0, [r5 + r4 * 8 + 0] vpbroadcastd m2, [r5 + r4 * 8 + 4] %else vpbroadcastd m0, [tab_LumaCoeff + r4 * 8 + 0] vpbroadcastd m2, [tab_LumaCoeff + r4 * 8 + 4] %endif pmaddubsw m1, m0 pmaddubsw m3, m0 pmaddubsw m5, m2 pmaddubsw m6, m2 vbroadcasti128 m0, [pw_1] pmaddwd m1, m0 pmaddwd m3, m0 pmaddwd m5, m0 pmaddwd m6, m0 paddd m1, m5 ; m1 = DQWORD ROW[1 0] paddd m3, m6 ; m3 = DQWORD ROW[3 2] packssdw m1, m3 ; m1 = QWORD ROW[3 1 2 0] ; TODO: does it overflow? pmulhrsw m1, [pw_512] vextracti128 xm2, m1, 1 packuswb xm1, xm2 ; m1 = DWORD ROW[3 1 2 0] movd [r2], xm1 pextrd [r2 + r3], xm1, 2 pextrd [r2 + r3 * 2], xm1, 1 lea r4, [r3 * 3] pextrd [r2 + r4], xm1, 3 RET INIT_YMM avx2 cglobal interp_8tap_vert_ps_4x4, 4, 6, 5 mov r4d, r4m shl r4d, 7 %ifdef PIC lea r5, [tab_LumaCoeffVer_32] add r5, r4 %else lea r5, [tab_LumaCoeffVer_32 + r4] %endif lea r4, [r1 * 3] sub r0, r4 add r3d, r3d movd xm1, [r0] pinsrd xm1, [r0 + r1], 1 pinsrd xm1, [r0 + r1 * 2], 2 pinsrd xm1, [r0 + r4], 3 ; m1 = row[3 2 1 0] lea r0, [r0 + r1 * 4] movd xm2, [r0] pinsrd xm2, [r0 + r1], 1 pinsrd xm2, [r0 + r1 * 2], 2 pinsrd xm2, [r0 + r4], 3 ; m2 = row[7 6 5 4] vinserti128 m1, m1, xm2, 1 ; m1 = row[7 6 5 4 3 2 1 0] lea r0, [r0 + r1 * 4] movd xm3, [r0] pinsrd xm3, [r0 + r1], 1 pinsrd xm3, [r0 + r1 * 2], 2 ; m3 = row[x 10 9 8] vinserti128 m2, m2, xm3, 1 ; m2 = row[x 10 9 8 7 6 5 4] mova m3, [interp4_vpp_shuf1] vpermd m0, m3, m1 ; m0 = row[4 3 3 2 2 1 1 0] vpermd m4, m3, m2 ; m4 = row[8 7 7 6 6 5 5 4] mova m3, [interp4_vpp_shuf1 + mmsize] vpermd m1, m3, m1 ; m1 = row[6 5 5 4 4 3 3 2] vpermd m2, m3, m2 ; m2 = row[10 9 9 8 8 7 7 6] mova m3, [interp4_vpp_shuf] pshufb m0, m0, m3 pshufb m1, m1, m3 pshufb m4, m4, m3 pshufb m2, m2, m3 pmaddubsw m0, [r5] pmaddubsw m1, [r5 + mmsize] pmaddubsw m4, [r5 + 2 * mmsize] pmaddubsw m2, [r5 + 3 * mmsize] paddw m0, m1 paddw m0, m4 paddw m0, m2 ; m0 = WORD ROW[3 2 1 0] psubw m0, [pw_2000] vextracti128 xm2, m0, 1 lea r5, [r3 * 3] movq [r2], xm0 movhps [r2 + r3], xm0 movq [r2 + r3 * 2], xm2 movhps [r2 + r5], xm2 RET %macro FILTER_VER_LUMA_AVX2_4xN 3 INIT_YMM avx2 %if ARCH_X86_64 == 1 cglobal interp_8tap_vert_%3_%1x%2, 4, 9, 10 mov r4d, r4m shl r4d, 7 %ifdef PIC lea r5, [tab_LumaCoeffVer_32] add r5, r4 %else lea r5, [tab_LumaCoeffVer_32 + r4] %endif lea r4, [r1 * 3] sub r0, r4 lea r6, [r1 * 4] %ifidn %3,pp mova m6, [pw_512] %else add r3d, r3d vbroadcasti128 m6, [pw_2000] %endif lea r8, [r3 * 3] mova m5, [interp4_vpp_shuf] mova m0, [interp4_vpp_shuf1] mova m7, [interp4_vpp_shuf1 + mmsize] mov r7d, %2 / 8 .loop: movd xm1, [r0] pinsrd xm1, [r0 + r1], 
1 pinsrd xm1, [r0 + r1 * 2], 2 pinsrd xm1, [r0 + r4], 3 ; m1 = row[3 2 1 0] lea r0, [r0 + r1 * 4] movd xm2, [r0] pinsrd xm2, [r0 + r1], 1 pinsrd xm2, [r0 + r1 * 2], 2 pinsrd xm2, [r0 + r4], 3 ; m2 = row[7 6 5 4] vinserti128 m1, m1, xm2, 1 ; m1 = row[7 6 5 4 3 2 1 0] lea r0, [r0 + r1 * 4] movd xm3, [r0] pinsrd xm3, [r0 + r1], 1 pinsrd xm3, [r0 + r1 * 2], 2 pinsrd xm3, [r0 + r4], 3 ; m3 = row[11 10 9 8] vinserti128 m2, m2, xm3, 1 ; m2 = row[11 10 9 8 7 6 5 4] lea r0, [r0 + r1 * 4] movd xm4, [r0] pinsrd xm4, [r0 + r1], 1 pinsrd xm4, [r0 + r1 * 2], 2 ; m4 = row[x 14 13 12] vinserti128 m3, m3, xm4, 1 ; m3 = row[x 14 13 12 11 10 9 8] vpermd m8, m0, m1 ; m8 = row[4 3 3 2 2 1 1 0] vpermd m4, m0, m2 ; m4 = row[8 7 7 6 6 5 5 4] vpermd m1, m7, m1 ; m1 = row[6 5 5 4 4 3 3 2] vpermd m2, m7, m2 ; m2 = row[10 9 9 8 8 7 7 6] vpermd m9, m0, m3 ; m9 = row[12 11 11 10 10 9 9 8] vpermd m3, m7, m3 ; m3 = row[14 13 13 12 12 11 11 10] pshufb m8, m8, m5 pshufb m1, m1, m5 pshufb m4, m4, m5 pshufb m9, m9, m5 pshufb m2, m2, m5 pshufb m3, m3, m5 pmaddubsw m8, [r5] pmaddubsw m1, [r5 + mmsize] pmaddubsw m9, [r5 + 2 * mmsize] pmaddubsw m3, [r5 + 3 * mmsize] paddw m8, m1 paddw m9, m3 pmaddubsw m1, m4, [r5 + 2 * mmsize] pmaddubsw m3, m2, [r5 + 3 * mmsize] pmaddubsw m4, [r5] pmaddubsw m2, [r5 + mmsize] paddw m3, m1 paddw m2, m4 paddw m8, m3 ; m8 = WORD ROW[3 2 1 0] paddw m9, m2 ; m9 = WORD ROW[7 6 5 4] %ifidn %3,pp pmulhrsw m8, m6 pmulhrsw m9, m6 packuswb m8, m9 vextracti128 xm1, m8, 1 movd [r2], xm8 pextrd [r2 + r3], xm8, 1 movd [r2 + r3 * 2], xm1 pextrd [r2 + r8], xm1, 1 lea r2, [r2 + r3 * 4] pextrd [r2], xm8, 2 pextrd [r2 + r3], xm8, 3 pextrd [r2 + r3 * 2], xm1, 2 pextrd [r2 + r8], xm1, 3 %else psubw m8, m6 psubw m9, m6 vextracti128 xm1, m8, 1 vextracti128 xm2, m9, 1 movq [r2], xm8 movhps [r2 + r3], xm8 movq [r2 + r3 * 2], xm1 movhps [r2 + r8], xm1 lea r2, [r2 + r3 * 4] movq [r2], xm9 movhps [r2 + r3], xm9 movq [r2 + r3 * 2], xm2 movhps [r2 + r8], xm2 %endif lea r2, [r2 + r3 * 4] sub r0, r6 dec r7d jnz .loop RET %endif %endmacro ;------------------------------------------------------------------------------------------------------------- ; void interp_8tap_vert_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;------------------------------------------------------------------------------------------------------------- FILTER_VER_LUMA_4xN 4, 4, pp ;------------------------------------------------------------------------------------------------------------- ; void interp_8tap_vert_pp_4x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;------------------------------------------------------------------------------------------------------------- FILTER_VER_LUMA_4xN 4, 8, pp FILTER_VER_LUMA_AVX2_4xN 4, 8, pp ;------------------------------------------------------------------------------------------------------------- ; void interp_8tap_vert_pp_4x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;------------------------------------------------------------------------------------------------------------- FILTER_VER_LUMA_4xN 4, 16, pp FILTER_VER_LUMA_AVX2_4xN 4, 16, pp ;------------------------------------------------------------------------------------------------------------- ; void interp_8tap_vert_ps_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;------------------------------------------------------------------------------------------------------------- FILTER_VER_LUMA_4xN 4, 4, ps 
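
; What the interp_8tap_vert_* kernels compute, as a minimal C sketch (the
; name interp8_vert_pp_c is illustrative only, not a symbol in this build).
; For the pp form, pmulhrsw with pw_512 evaluates
; (sum * 512 + 16384) >> 15 = (sum + 32) >> 6, the usual rounding for filter
; coefficients that sum to 64, and packuswb clamps to 0..255; the ps form
; skips the rounding and stores sum - 8192 (pw_2000) as 16-bit intermediates.
;
;     static void interp8_vert_pp_c(const uint8_t *src, intptr_t srcStride,
;                                   uint8_t *dst, intptr_t dstStride,
;                                   const int8_t coeff[8], int width, int height)
;     {
;         src -= 3 * srcStride;                /* taps cover rows -3 .. +4 */
;         for (int y = 0; y < height; y++) {
;             for (int x = 0; x < width; x++) {
;                 int sum = 0;
;                 for (int k = 0; k < 8; k++)
;                     sum += coeff[k] * src[x + k * srcStride];
;                 sum = (sum + 32) >> 6;       /* matches pmulhrsw with pw_512 */
;                 dst[x] = (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
;             }
;             src += srcStride;
;             dst += dstStride;
;         }
;     }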
;------------------------------------------------------------------------------------------------------------- ; void interp_8tap_vert_ps_4x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;------------------------------------------------------------------------------------------------------------- FILTER_VER_LUMA_4xN 4, 8, ps FILTER_VER_LUMA_AVX2_4xN 4, 8, ps ;------------------------------------------------------------------------------------------------------------- ; void interp_8tap_vert_ps_4x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;------------------------------------------------------------------------------------------------------------- FILTER_VER_LUMA_4xN 4, 16, ps FILTER_VER_LUMA_AVX2_4xN 4, 16, ps %macro PROCESS_LUMA_AVX2_W8_8R 0 movq xm1, [r0] ; m1 = row 0 movq xm2, [r0 + r1] ; m2 = row 1 punpcklbw xm1, xm2 ; m1 = [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] movq xm3, [r0 + r1 * 2] ; m3 = row 2 punpcklbw xm2, xm3 ; m2 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] vinserti128 m5, m1, xm2, 1 ; m5 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] - [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] pmaddubsw m5, [r5] movq xm4, [r0 + r4] ; m4 = row 3 punpcklbw xm3, xm4 ; m3 = [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20] lea r0, [r0 + r1 * 4] movq xm1, [r0] ; m1 = row 4 punpcklbw xm4, xm1 ; m4 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] vinserti128 m2, m3, xm4, 1 ; m2 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] - [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20] pmaddubsw m0, m2, [r5 + 1 * mmsize] paddw m5, m0 pmaddubsw m2, [r5] movq xm3, [r0 + r1] ; m3 = row 5 punpcklbw xm1, xm3 ; m1 = [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40] movq xm4, [r0 + r1 * 2] ; m4 = row 6 punpcklbw xm3, xm4 ; m3 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] vinserti128 m1, m1, xm3, 1 ; m1 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] - [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40] pmaddubsw m3, m1, [r5 + 2 * mmsize] paddw m5, m3 pmaddubsw m0, m1, [r5 + 1 * mmsize] paddw m2, m0 pmaddubsw m1, [r5] movq xm3, [r0 + r4] ; m3 = row 7 punpcklbw xm4, xm3 ; m4 = [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60] lea r0, [r0 + r1 * 4] movq xm0, [r0] ; m0 = row 8 punpcklbw xm3, xm0 ; m3 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] vinserti128 m4, m4, xm3, 1 ; m4 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] - [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60] pmaddubsw m3, m4, [r5 + 3 * mmsize] paddw m5, m3 pmaddubsw m3, m4, [r5 + 2 * mmsize] paddw m2, m3 pmaddubsw m3, m4, [r5 + 1 * mmsize] paddw m1, m3 pmaddubsw m4, [r5] movq xm3, [r0 + r1] ; m3 = row 9 punpcklbw xm0, xm3 ; m0 = [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80] movq xm6, [r0 + r1 * 2] ; m6 = row 10 punpcklbw xm3, xm6 ; m3 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90] vinserti128 m0, m0, xm3, 1 ; m0 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90] - [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80] pmaddubsw m3, m0, [r5 + 3 * mmsize] paddw m2, m3 pmaddubsw m3, m0, [r5 + 2 * mmsize] paddw m1, m3 pmaddubsw m0, [r5 + 1 * mmsize] paddw m4, m0 movq xm3, [r0 + r4] ; m3 = row 11 punpcklbw xm6, xm3 ; m6 = [B7 A7 B6 A6 B5 A5 B4 A4 B3 A3 B2 A2 B1 A1 B0 A0] lea r0, [r0 + r1 * 4] movq xm0, [r0] ; m0 = row 12 punpcklbw xm3, xm0 ; m3 = [C7 B7 C6 B6 C5 B5 C4 B4 C3 B3 C2 B2 C1 B1 C0 B0] vinserti128 m6, m6, xm3, 1 ; m6 = [C7 B7 C6 B6 C5 B5 C4 B4 C3 B3 C2 B2 C1 B1 C0 B0] - [B7 A7 B6 A6 B5 A5 B4 A4 B3 A3 B2 A2 B1 A1 B0 A0] 
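    ; each ymm register holds two consecutive row-pairs (one per 128-bit
    ; lane), so every pmaddubsw against a tab_LumaCoeffVer_32 entry advances
    ; two output rows at once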
pmaddubsw m3, m6, [r5 + 3 * mmsize] paddw m1, m3 pmaddubsw m6, [r5 + 2 * mmsize] paddw m4, m6 movq xm3, [r0 + r1] ; m3 = row 13 punpcklbw xm0, xm3 ; m0 = [D7 C7 D6 C6 D5 C5 D4 C4 D3 C3 D2 C2 D1 C1 D0 C0] movq xm6, [r0 + r1 * 2] ; m6 = row 14 punpcklbw xm3, xm6 ; m3 = [E7 D7 E6 D6 E5 D5 E4 D4 E3 D3 E2 D2 E1 D1 E0 D0] vinserti128 m0, m0, xm3, 1 ; m0 = [E7 D7 E6 D6 E5 D5 E4 D4 E3 D3 E2 D2 E1 D1 E0 D0] - [D7 C7 D6 C6 D5 C5 D4 C4 D3 C3 D2 C2 D1 C1 D0 C0] pmaddubsw m0, [r5 + 3 * mmsize] paddw m4, m0 %endmacro %macro PROCESS_LUMA_AVX2_W8_4R 0 movq xm1, [r0] ; m1 = row 0 movq xm2, [r0 + r1] ; m2 = row 1 punpcklbw xm1, xm2 ; m1 = [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] movq xm3, [r0 + r1 * 2] ; m3 = row 2 punpcklbw xm2, xm3 ; m2 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] vinserti128 m5, m1, xm2, 1 ; m5 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] - [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] pmaddubsw m5, [r5] movq xm4, [r0 + r4] ; m4 = row 3 punpcklbw xm3, xm4 ; m3 = [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20] lea r0, [r0 + r1 * 4] movq xm1, [r0] ; m1 = row 4 punpcklbw xm4, xm1 ; m4 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] vinserti128 m2, m3, xm4, 1 ; m2 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] - [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20] pmaddubsw m0, m2, [r5 + 1 * mmsize] paddw m5, m0 pmaddubsw m2, [r5] movq xm3, [r0 + r1] ; m3 = row 5 punpcklbw xm1, xm3 ; m1 = [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40] movq xm4, [r0 + r1 * 2] ; m4 = row 6 punpcklbw xm3, xm4 ; m3 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] vinserti128 m1, m1, xm3, 1 ; m1 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] - [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40] pmaddubsw m3, m1, [r5 + 2 * mmsize] paddw m5, m3 pmaddubsw m0, m1, [r5 + 1 * mmsize] paddw m2, m0 movq xm3, [r0 + r4] ; m3 = row 7 punpcklbw xm4, xm3 ; m4 = [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60] lea r0, [r0 + r1 * 4] movq xm0, [r0] ; m0 = row 8 punpcklbw xm3, xm0 ; m3 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] vinserti128 m4, m4, xm3, 1 ; m4 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] - [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60] pmaddubsw m3, m4, [r5 + 3 * mmsize] paddw m5, m3 pmaddubsw m3, m4, [r5 + 2 * mmsize] paddw m2, m3 movq xm3, [r0 + r1] ; m3 = row 9 punpcklbw xm0, xm3 ; m0 = [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80] movq xm6, [r0 + r1 * 2] ; m6 = row 10 punpcklbw xm3, xm6 ; m3 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90] vinserti128 m0, m0, xm3, 1 ; m0 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90] - [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80] pmaddubsw m3, m0, [r5 + 3 * mmsize] paddw m2, m3 %endmacro ;------------------------------------------------------------------------------------------------------------- ; void interp_8tap_vert_%3_8x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;------------------------------------------------------------------------------------------------------------- %macro FILTER_VER_LUMA_8xN 3 INIT_XMM sse4 cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8 lea r5, [3 * r1] sub r0, r5 shl r4d, 6 %ifidn %3,ps add r3d, r3d %endif %ifdef PIC lea r5, [tab_LumaCoeffVer] lea r6, [r5 + r4] %else lea r6, [tab_LumaCoeffVer + r4] %endif %ifidn %3,pp mova m3, [pw_512] %else mova m3, [pw_2000] %endif mov r4d, %2/4 lea r5, [4 * r1] .loopH: PROCESS_LUMA_W8_4R %ifidn %3,pp pmulhrsw m7, m3 pmulhrsw m6, m3 pmulhrsw m5, m3 pmulhrsw m4, m3 packuswb m7, m6 packuswb m5, m4 
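    ; packuswb saturates the four rounded rows back to 8-bit pixels (0..255)
    ; ahead of the stores below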
movlps [r2], m7 movhps [r2 + r3], m7 lea r2, [r2 + 2 * r3] movlps [r2], m5 movhps [r2 + r3], m5 %else psubw m7, m3 psubw m6, m3 psubw m5, m3 psubw m4, m3 movu [r2], m7 movu [r2 + r3], m6 lea r2, [r2 + 2 * r3] movu [r2], m5 movu [r2 + r3], m4 %endif sub r0, r5 lea r2, [r2 + 2 * r3] dec r4d jnz .loopH RET %endmacro %macro FILTER_VER_LUMA_AVX2_8xN 3 INIT_YMM avx2 cglobal interp_8tap_vert_%3_%1x%2, 4, 7, 8, 0-gprsize mov r4d, r4m shl r4d, 7 %ifdef PIC lea r5, [tab_LumaCoeffVer_32] add r5, r4 %else lea r5, [tab_LumaCoeffVer_32 + r4] %endif lea r4, [r1 * 3] sub r0, r4 lea r6, [r1 * 4] %ifidn %3,pp mova m7, [pw_512] %else add r3d, r3d vbroadcasti128 m7, [pw_2000] %endif mov word [rsp], %2 / 8 .loop: PROCESS_LUMA_AVX2_W8_8R %ifidn %3,pp pmulhrsw m5, m7 ; m5 = word: row 0, row 1 pmulhrsw m2, m7 ; m2 = word: row 2, row 3 pmulhrsw m1, m7 ; m1 = word: row 4, row 5 pmulhrsw m4, m7 ; m4 = word: row 6, row 7 packuswb m5, m2 packuswb m1, m4 vextracti128 xm2, m5, 1 vextracti128 xm4, m1, 1 movq [r2], xm5 movq [r2 + r3], xm2 lea r2, [r2 + r3 * 2] movhps [r2], xm5 movhps [r2 + r3], xm2 lea r2, [r2 + r3 * 2] movq [r2], xm1 movq [r2 + r3], xm4 lea r2, [r2 + r3 * 2] movhps [r2], xm1 movhps [r2 + r3], xm4 %else psubw m5, m7 ; m5 = word: row 0, row 1 psubw m2, m7 ; m2 = word: row 2, row 3 psubw m1, m7 ; m1 = word: row 4, row 5 psubw m4, m7 ; m4 = word: row 6, row 7 vextracti128 xm6, m5, 1 vextracti128 xm3, m2, 1 vextracti128 xm0, m1, 1 movu [r2], xm5 movu [r2 + r3], xm6 lea r2, [r2 + r3 * 2] movu [r2], xm2 movu [r2 + r3], xm3 lea r2, [r2 + r3 * 2] movu [r2], xm1 movu [r2 + r3], xm0 lea r2, [r2 + r3 * 2] movu [r2], xm4 vextracti128 xm4, m4, 1 movu [r2 + r3], xm4 %endif lea r2, [r2 + r3 * 2] sub r0, r6 dec word [rsp] jnz .loop RET %endmacro %macro FILTER_VER_LUMA_AVX2_8x8 1 INIT_YMM avx2 cglobal interp_8tap_vert_%1_8x8, 4, 6, 7 mov r4d, r4m shl r4d, 7 %ifdef PIC lea r5, [tab_LumaCoeffVer_32] add r5, r4 %else lea r5, [tab_LumaCoeffVer_32 + r4] %endif lea r4, [r1 * 3] sub r0, r4 PROCESS_LUMA_AVX2_W8_8R %ifidn %1,pp mova m3, [pw_512] %else add r3d, r3d vbroadcasti128 m3, [pw_2000] %endif lea r4, [r3 * 3] %ifidn %1,pp pmulhrsw m5, m3 ; m5 = word: row 0, row 1 pmulhrsw m2, m3 ; m2 = word: row 2, row 3 pmulhrsw m1, m3 ; m1 = word: row 4, row 5 pmulhrsw m4, m3 ; m4 = word: row 6, row 7 packuswb m5, m2 packuswb m1, m4 vextracti128 xm2, m5, 1 vextracti128 xm4, m1, 1 movq [r2], xm5 movq [r2 + r3], xm2 movhps [r2 + r3 * 2], xm5 movhps [r2 + r4], xm2 lea r2, [r2 + r3 * 4] movq [r2], xm1 movq [r2 + r3], xm4 movhps [r2 + r3 * 2], xm1 movhps [r2 + r4], xm4 %else psubw m5, m3 ; m5 = word: row 0, row 1 psubw m2, m3 ; m2 = word: row 2, row 3 psubw m1, m3 ; m1 = word: row 4, row 5 psubw m4, m3 ; m4 = word: row 6, row 7 vextracti128 xm6, m5, 1 vextracti128 xm3, m2, 1 vextracti128 xm0, m1, 1 movu [r2], xm5 movu [r2 + r3], xm6 movu [r2 + r3 * 2], xm2 movu [r2 + r4], xm3 lea r2, [r2 + r3 * 4] movu [r2], xm1 movu [r2 + r3], xm0 movu [r2 + r3 * 2], xm4 vextracti128 xm4, m4, 1 movu [r2 + r4], xm4 %endif RET %endmacro %macro FILTER_VER_LUMA_AVX2_8x4 1 INIT_YMM avx2 cglobal interp_8tap_vert_%1_8x4, 4, 6, 7 mov r4d, r4m shl r4d, 7 %ifdef PIC lea r5, [tab_LumaCoeffVer_32] add r5, r4 %else lea r5, [tab_LumaCoeffVer_32 + r4] %endif lea r4, [r1 * 3] sub r0, r4 PROCESS_LUMA_AVX2_W8_4R %ifidn %1,pp mova m3, [pw_512] %else add r3d, r3d vbroadcasti128 m3, [pw_2000] %endif lea r4, [r3 * 3] %ifidn %1,pp pmulhrsw m5, m3 ; m5 = word: row 0, row 1 pmulhrsw m2, m3 ; m2 = word: row 2, row 3 packuswb m5, m2 vextracti128 xm2, m5, 1 movq [r2], xm5 movq [r2 + 
r3], xm2 movhps [r2 + r3 * 2], xm5 movhps [r2 + r4], xm2 %else psubw m5, m3 ; m5 = word: row 0, row 1 psubw m2, m3 ; m2 = word: row 2, row 3 movu [r2], xm5 vextracti128 xm5, m5, 1 movu [r2 + r3], xm5 movu [r2 + r3 * 2], xm2 vextracti128 xm2, m2, 1 movu [r2 + r4], xm2 %endif RET %endmacro ;------------------------------------------------------------------------------------------------------------- ; void interp_8tap_vert_pp_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;------------------------------------------------------------------------------------------------------------- FILTER_VER_LUMA_8xN 8, 4, pp FILTER_VER_LUMA_AVX2_8x4 pp ;------------------------------------------------------------------------------------------------------------- ; void interp_8tap_vert_pp_8x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;------------------------------------------------------------------------------------------------------------- FILTER_VER_LUMA_8xN 8, 8, pp FILTER_VER_LUMA_AVX2_8x8 pp ;------------------------------------------------------------------------------------------------------------- ; void interp_8tap_vert_pp_8x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;------------------------------------------------------------------------------------------------------------- FILTER_VER_LUMA_8xN 8, 16, pp FILTER_VER_LUMA_AVX2_8xN 8, 16, pp ;------------------------------------------------------------------------------------------------------------- ; void interp_8tap_vert_pp_8x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;------------------------------------------------------------------------------------------------------------- FILTER_VER_LUMA_8xN 8, 32, pp FILTER_VER_LUMA_AVX2_8xN 8, 32, pp ;------------------------------------------------------------------------------------------------------------- ; void interp_8tap_vert_ps_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;------------------------------------------------------------------------------------------------------------- FILTER_VER_LUMA_8xN 8, 4, ps FILTER_VER_LUMA_AVX2_8x4 ps ;------------------------------------------------------------------------------------------------------------- ; void interp_8tap_vert_ps_8x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;------------------------------------------------------------------------------------------------------------- FILTER_VER_LUMA_8xN 8, 8, ps FILTER_VER_LUMA_AVX2_8x8 ps ;------------------------------------------------------------------------------------------------------------- ; void interp_8tap_vert_ps_8x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;------------------------------------------------------------------------------------------------------------- FILTER_VER_LUMA_8xN 8, 16, ps FILTER_VER_LUMA_AVX2_8xN 8, 16, ps ;------------------------------------------------------------------------------------------------------------- ; void interp_8tap_vert_ps_8x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;------------------------------------------------------------------------------------------------------------- FILTER_VER_LUMA_8xN 8, 32, ps FILTER_VER_LUMA_AVX2_8xN 8, 32, ps ;------------------------------------------------------------------------------------------------------------- ; void 
interp_8tap_vert_%3_12x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;------------------------------------------------------------------------------------------------------------- %macro FILTER_VER_LUMA_12xN 3 INIT_XMM sse4 cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8 lea r5, [3 * r1] sub r0, r5 shl r4d, 6 %ifidn %3,ps add r3d, r3d %endif %ifdef PIC lea r5, [tab_LumaCoeffVer] lea r6, [r5 + r4] %else lea r6, [tab_LumaCoeffVer + r4] %endif %ifidn %3,pp mova m3, [pw_512] %else mova m3, [pw_2000] %endif mov r4d, %2/4 .loopH: PROCESS_LUMA_W8_4R %ifidn %3,pp pmulhrsw m7, m3 pmulhrsw m6, m3 pmulhrsw m5, m3 pmulhrsw m4, m3 packuswb m7, m6 packuswb m5, m4 movlps [r2], m7 movhps [r2 + r3], m7 lea r5, [r2 + 2 * r3] movlps [r5], m5 movhps [r5 + r3], m5 %else psubw m7, m3 psubw m6, m3 psubw m5, m3 psubw m4, m3 movu [r2], m7 movu [r2 + r3], m6 lea r5, [r2 + 2 * r3] movu [r5], m5 movu [r5 + r3], m4 %endif lea r5, [8 * r1 - 8] sub r0, r5 %ifidn %3,pp add r2, 8 %else add r2, 16 %endif PROCESS_LUMA_W4_4R %ifidn %3,pp pmulhrsw m4, m3 pmulhrsw m5, m3 packuswb m4, m5 movd [r2], m4 pextrd [r2 + r3], m4, 1 lea r5, [r2 + 2 * r3] pextrd [r5], m4, 2 pextrd [r5 + r3], m4, 3 %else psubw m4, m3 psubw m5, m3 movlps [r2], m4 movhps [r2 + r3], m4 lea r5, [r2 + 2 * r3] movlps [r5], m5 movhps [r5 + r3], m5 %endif lea r5, [4 * r1 + 8] sub r0, r5 %ifidn %3,pp lea r2, [r2 + 4 * r3 - 8] %else lea r2, [r2 + 4 * r3 - 16] %endif dec r4d jnz .loopH RET %endmacro ;------------------------------------------------------------------------------------------------------------- ; void interp_8tap_vert_pp_12x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;------------------------------------------------------------------------------------------------------------- FILTER_VER_LUMA_12xN 12, 16, pp ;------------------------------------------------------------------------------------------------------------- ; void interp_8tap_vert_ps_12x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;------------------------------------------------------------------------------------------------------------- FILTER_VER_LUMA_12xN 12, 16, ps %macro FILTER_VER_LUMA_AVX2_12x16 1 INIT_YMM avx2 %if ARCH_X86_64 == 1 cglobal interp_8tap_vert_%1_12x16, 4, 7, 15 mov r4d, r4m shl r4d, 7 %ifdef PIC lea r5, [tab_LumaCoeffVer_32] add r5, r4 %else lea r5, [tab_LumaCoeffVer_32 + r4] %endif lea r4, [r1 * 3] sub r0, r4 %ifidn %1,pp mova m14, [pw_512] %else add r3d, r3d vbroadcasti128 m14, [pw_2000] %endif lea r6, [r3 * 3] movu xm0, [r0] ; m0 = row 0 movu xm1, [r0 + r1] ; m1 = row 1 punpckhbw xm2, xm0, xm1 punpcklbw xm0, xm1 vinserti128 m0, m0, xm2, 1 pmaddubsw m0, [r5] movu xm2, [r0 + r1 * 2] ; m2 = row 2 punpckhbw xm3, xm1, xm2 punpcklbw xm1, xm2 vinserti128 m1, m1, xm3, 1 pmaddubsw m1, [r5] movu xm3, [r0 + r4] ; m3 = row 3 punpckhbw xm4, xm2, xm3 punpcklbw xm2, xm3 vinserti128 m2, m2, xm4, 1 pmaddubsw m4, m2, [r5 + 1 * mmsize] paddw m0, m4 pmaddubsw m2, [r5] lea r0, [r0 + r1 * 4] movu xm4, [r0] ; m4 = row 4 punpckhbw xm5, xm3, xm4 punpcklbw xm3, xm4 vinserti128 m3, m3, xm5, 1 pmaddubsw m5, m3, [r5 + 1 * mmsize] paddw m1, m5 pmaddubsw m3, [r5] movu xm5, [r0 + r1] ; m5 = row 5 punpckhbw xm6, xm4, xm5 punpcklbw xm4, xm5 vinserti128 m4, m4, xm6, 1 pmaddubsw m6, m4, [r5 + 2 * mmsize] paddw m0, m6 pmaddubsw m6, m4, [r5 + 1 * mmsize] paddw m2, m6 pmaddubsw m4, [r5] movu xm6, [r0 + r1 * 2] ; m6 = row 6 punpckhbw xm7, xm5, xm6 punpcklbw xm5, xm6 vinserti128 m5, m5, xm7, 1 pmaddubsw 
m7, m5, [r5 + 2 * mmsize] paddw m1, m7 pmaddubsw m7, m5, [r5 + 1 * mmsize] paddw m3, m7 pmaddubsw m5, [r5] movu xm7, [r0 + r4] ; m7 = row 7 punpckhbw xm8, xm6, xm7 punpcklbw xm6, xm7 vinserti128 m6, m6, xm8, 1 pmaddubsw m8, m6, [r5 + 3 * mmsize] paddw m0, m8 pmaddubsw m8, m6, [r5 + 2 * mmsize] paddw m2, m8 pmaddubsw m8, m6, [r5 + 1 * mmsize] paddw m4, m8 pmaddubsw m6, [r5] lea r0, [r0 + r1 * 4] movu xm8, [r0] ; m8 = row 8 punpckhbw xm9, xm7, xm8 punpcklbw xm7, xm8 vinserti128 m7, m7, xm9, 1 pmaddubsw m9, m7, [r5 + 3 * mmsize] paddw m1, m9 pmaddubsw m9, m7, [r5 + 2 * mmsize] paddw m3, m9 pmaddubsw m9, m7, [r5 + 1 * mmsize] paddw m5, m9 pmaddubsw m7, [r5] movu xm9, [r0 + r1] ; m9 = row 9 punpckhbw xm10, xm8, xm9 punpcklbw xm8, xm9 vinserti128 m8, m8, xm10, 1 pmaddubsw m10, m8, [r5 + 3 * mmsize] paddw m2, m10 pmaddubsw m10, m8, [r5 + 2 * mmsize] paddw m4, m10 pmaddubsw m10, m8, [r5 + 1 * mmsize] paddw m6, m10 pmaddubsw m8, [r5] movu xm10, [r0 + r1 * 2] ; m10 = row 10 punpckhbw xm11, xm9, xm10 punpcklbw xm9, xm10 vinserti128 m9, m9, xm11, 1 pmaddubsw m11, m9, [r5 + 3 * mmsize] paddw m3, m11 pmaddubsw m11, m9, [r5 + 2 * mmsize] paddw m5, m11 pmaddubsw m11, m9, [r5 + 1 * mmsize] paddw m7, m11 pmaddubsw m9, [r5] movu xm11, [r0 + r4] ; m11 = row 11 punpckhbw xm12, xm10, xm11 punpcklbw xm10, xm11 vinserti128 m10, m10, xm12, 1 pmaddubsw m12, m10, [r5 + 3 * mmsize] paddw m4, m12 pmaddubsw m12, m10, [r5 + 2 * mmsize] paddw m6, m12 pmaddubsw m12, m10, [r5 + 1 * mmsize] paddw m8, m12 pmaddubsw m10, [r5] lea r0, [r0 + r1 * 4] movu xm12, [r0] ; m12 = row 12 punpckhbw xm13, xm11, xm12 punpcklbw xm11, xm12 vinserti128 m11, m11, xm13, 1 pmaddubsw m13, m11, [r5 + 3 * mmsize] paddw m5, m13 pmaddubsw m13, m11, [r5 + 2 * mmsize] paddw m7, m13 pmaddubsw m13, m11, [r5 + 1 * mmsize] paddw m9, m13 pmaddubsw m11, [r5] %ifidn %1,pp pmulhrsw m0, m14 ; m0 = word: row 0 pmulhrsw m1, m14 ; m1 = word: row 1 pmulhrsw m2, m14 ; m2 = word: row 2 pmulhrsw m3, m14 ; m3 = word: row 3 pmulhrsw m4, m14 ; m4 = word: row 4 pmulhrsw m5, m14 ; m5 = word: row 5 packuswb m0, m1 packuswb m2, m3 packuswb m4, m5 vpermq m0, m0, 11011000b vpermq m2, m2, 11011000b vpermq m4, m4, 11011000b vextracti128 xm1, m0, 1 vextracti128 xm3, m2, 1 vextracti128 xm5, m4, 1 movq [r2], xm0 pextrd [r2 + 8], xm0, 2 movq [r2 + r3], xm1 pextrd [r2 + r3 + 8], xm1, 2 movq [r2 + r3 * 2], xm2 pextrd [r2 + r3 * 2 + 8], xm2, 2 movq [r2 + r6], xm3 pextrd [r2 + r6 + 8], xm3, 2 lea r2, [r2 + r3 * 4] movq [r2], xm4 pextrd [r2 + 8], xm4, 2 movq [r2 + r3], xm5 pextrd [r2 + r3 + 8], xm5, 2 %else psubw m0, m14 ; m0 = word: row 0 psubw m1, m14 ; m1 = word: row 1 psubw m2, m14 ; m2 = word: row 2 psubw m3, m14 ; m3 = word: row 3 psubw m4, m14 ; m4 = word: row 4 psubw m5, m14 ; m5 = word: row 5 movu [r2], xm0 vextracti128 xm0, m0, 1 movq [r2 + 16], xm0 movu [r2 + r3], xm1 vextracti128 xm1, m1, 1 movq [r2 + r3 + 16], xm1 movu [r2 + r3 * 2], xm2 vextracti128 xm2, m2, 1 movq [r2 + r3 * 2 + 16], xm2 movu [r2 + r6], xm3 vextracti128 xm3, m3, 1 movq [r2 + r6 + 16], xm3 lea r2, [r2 + r3 * 4] movu [r2], xm4 vextracti128 xm4, m4, 1 movq [r2 + 16], xm4 movu [r2 + r3], xm5 vextracti128 xm5, m5, 1 movq [r2 + r3 + 16], xm5 %endif movu xm13, [r0 + r1] ; m13 = row 13 punpckhbw xm0, xm12, xm13 punpcklbw xm12, xm13 vinserti128 m12, m12, xm0, 1 pmaddubsw m0, m12, [r5 + 3 * mmsize] paddw m6, m0 pmaddubsw m0, m12, [r5 + 2 * mmsize] paddw m8, m0 pmaddubsw m0, m12, [r5 + 1 * mmsize] paddw m10, m0 pmaddubsw m12, [r5] movu xm0, [r0 + r1 * 2] ; m0 = row 14 punpckhbw xm1, xm13, xm0 punpcklbw xm13, xm0 
vinserti128 m13, m13, xm1, 1 pmaddubsw m1, m13, [r5 + 3 * mmsize] paddw m7, m1 pmaddubsw m1, m13, [r5 + 2 * mmsize] paddw m9, m1 pmaddubsw m1, m13, [r5 + 1 * mmsize] paddw m11, m1 pmaddubsw m13, [r5] %ifidn %1,pp pmulhrsw m6, m14 ; m6 = word: row 6 pmulhrsw m7, m14 ; m7 = word: row 7 packuswb m6, m7 vpermq m6, m6, 11011000b vextracti128 xm7, m6, 1 movq [r2 + r3 * 2], xm6 pextrd [r2 + r3 * 2 + 8], xm6, 2 movq [r2 + r6], xm7 pextrd [r2 + r6 + 8], xm7, 2 %else psubw m6, m14 ; m6 = word: row 6 psubw m7, m14 ; m7 = word: row 7 movu [r2 + r3 * 2], xm6 vextracti128 xm6, m6, 1 movq [r2 + r3 * 2 + 16], xm6 movu [r2 + r6], xm7 vextracti128 xm7, m7, 1 movq [r2 + r6 + 16], xm7 %endif lea r2, [r2 + r3 * 4] movu xm1, [r0 + r4] ; m1 = row 15 punpckhbw xm2, xm0, xm1 punpcklbw xm0, xm1 vinserti128 m0, m0, xm2, 1 pmaddubsw m2, m0, [r5 + 3 * mmsize] paddw m8, m2 pmaddubsw m2, m0, [r5 + 2 * mmsize] paddw m10, m2 pmaddubsw m2, m0, [r5 + 1 * mmsize] paddw m12, m2 pmaddubsw m0, [r5] lea r0, [r0 + r1 * 4] movu xm2, [r0] ; m2 = row 16 punpckhbw xm3, xm1, xm2 punpcklbw xm1, xm2 vinserti128 m1, m1, xm3, 1 pmaddubsw m3, m1, [r5 + 3 * mmsize] paddw m9, m3 pmaddubsw m3, m1, [r5 + 2 * mmsize] paddw m11, m3 pmaddubsw m3, m1, [r5 + 1 * mmsize] paddw m13, m3 pmaddubsw m1, [r5] movu xm3, [r0 + r1] ; m3 = row 17 punpckhbw xm4, xm2, xm3 punpcklbw xm2, xm3 vinserti128 m2, m2, xm4, 1 pmaddubsw m4, m2, [r5 + 3 * mmsize] paddw m10, m4 pmaddubsw m4, m2, [r5 + 2 * mmsize] paddw m12, m4 pmaddubsw m2, [r5 + 1 * mmsize] paddw m0, m2 movu xm4, [r0 + r1 * 2] ; m4 = row 18 punpckhbw xm5, xm3, xm4 punpcklbw xm3, xm4 vinserti128 m3, m3, xm5, 1 pmaddubsw m5, m3, [r5 + 3 * mmsize] paddw m11, m5 pmaddubsw m5, m3, [r5 + 2 * mmsize] paddw m13, m5 pmaddubsw m3, [r5 + 1 * mmsize] paddw m1, m3 movu xm5, [r0 + r4] ; m5 = row 19 punpckhbw xm6, xm4, xm5 punpcklbw xm4, xm5 vinserti128 m4, m4, xm6, 1 pmaddubsw m6, m4, [r5 + 3 * mmsize] paddw m12, m6 pmaddubsw m4, [r5 + 2 * mmsize] paddw m0, m4 lea r0, [r0 + r1 * 4] movu xm6, [r0] ; m6 = row 20 punpckhbw xm7, xm5, xm6 punpcklbw xm5, xm6 vinserti128 m5, m5, xm7, 1 pmaddubsw m7, m5, [r5 + 3 * mmsize] paddw m13, m7 pmaddubsw m5, [r5 + 2 * mmsize] paddw m1, m5 movu xm7, [r0 + r1] ; m7 = row 21 punpckhbw xm2, xm6, xm7 punpcklbw xm6, xm7 vinserti128 m6, m6, xm2, 1 pmaddubsw m6, [r5 + 3 * mmsize] paddw m0, m6 movu xm2, [r0 + r1 * 2] ; m2 = row 22 punpckhbw xm3, xm7, xm2 punpcklbw xm7, xm2 vinserti128 m7, m7, xm3, 1 pmaddubsw m7, [r5 + 3 * mmsize] paddw m1, m7 %ifidn %1,pp pmulhrsw m8, m14 ; m8 = word: row 8 pmulhrsw m9, m14 ; m9 = word: row 9 pmulhrsw m10, m14 ; m10 = word: row 10 pmulhrsw m11, m14 ; m11 = word: row 11 pmulhrsw m12, m14 ; m12 = word: row 12 pmulhrsw m13, m14 ; m13 = word: row 13 pmulhrsw m0, m14 ; m0 = word: row 14 pmulhrsw m1, m14 ; m1 = word: row 15 packuswb m8, m9 packuswb m10, m11 packuswb m12, m13 packuswb m0, m1 vpermq m8, m8, 11011000b vpermq m10, m10, 11011000b vpermq m12, m12, 11011000b vpermq m0, m0, 11011000b vextracti128 xm9, m8, 1 vextracti128 xm11, m10, 1 vextracti128 xm13, m12, 1 vextracti128 xm1, m0, 1 movq [r2], xm8 pextrd [r2 + 8], xm8, 2 movq [r2 + r3], xm9 pextrd [r2 + r3 + 8], xm9, 2 movq [r2 + r3 * 2], xm10 pextrd [r2 + r3 * 2 + 8], xm10, 2 movq [r2 + r6], xm11 pextrd [r2 + r6 + 8], xm11, 2 lea r2, [r2 + r3 * 4] movq [r2], xm12 pextrd [r2 + 8], xm12, 2 movq [r2 + r3], xm13 pextrd [r2 + r3 + 8], xm13, 2 movq [r2 + r3 * 2], xm0 pextrd [r2 + r3 * 2 + 8], xm0, 2 movq [r2 + r6], xm1 pextrd [r2 + r6 + 8], xm1, 2 %else psubw m8, m14 ; m8 = word: row 8 psubw m9, m14 ; m9 = word: 
row 9 psubw m10, m14 ; m10 = word: row 10 psubw m11, m14 ; m11 = word: row 11 psubw m12, m14 ; m12 = word: row 12 psubw m13, m14 ; m13 = word: row 13 psubw m0, m14 ; m0 = word: row 14 psubw m1, m14 ; m1 = word: row 15 movu [r2], xm8 vextracti128 xm8, m8, 1 movq [r2 + 16], xm8 movu [r2 + r3], xm9 vextracti128 xm9, m9, 1 movq [r2 + r3 + 16], xm9 movu [r2 + r3 * 2], xm10 vextracti128 xm10, m10, 1 movq [r2 + r3 * 2 + 16], xm10 movu [r2 + r6], xm11 vextracti128 xm11, m11, 1 movq [r2 + r6 + 16], xm11 lea r2, [r2 + r3 * 4] movu [r2], xm12 vextracti128 xm12, m12, 1 movq [r2 + 16], xm12 movu [r2 + r3], xm13 vextracti128 xm13, m13, 1 movq [r2 + r3 + 16], xm13 movu [r2 + r3 * 2], xm0 vextracti128 xm0, m0, 1 movq [r2 + r3 * 2 + 16], xm0 movu [r2 + r6], xm1 vextracti128 xm1, m1, 1 movq [r2 + r6 + 16], xm1 %endif RET %endif %endmacro FILTER_VER_LUMA_AVX2_12x16 pp FILTER_VER_LUMA_AVX2_12x16 ps %macro FILTER_VER_LUMA_AVX2_16x16 1 INIT_YMM avx2 %if ARCH_X86_64 == 1 cglobal interp_8tap_vert_%1_16x16, 4, 7, 15 mov r4d, r4m shl r4d, 7 %ifdef PIC lea r5, [tab_LumaCoeffVer_32] add r5, r4 %else lea r5, [tab_LumaCoeffVer_32 + r4] %endif lea r4, [r1 * 3] sub r0, r4 %ifidn %1,pp mova m14, [pw_512] %else add r3d, r3d vbroadcasti128 m14, [pw_2000] %endif lea r6, [r3 * 3] movu xm0, [r0] ; m0 = row 0 movu xm1, [r0 + r1] ; m1 = row 1 punpckhbw xm2, xm0, xm1 punpcklbw xm0, xm1 vinserti128 m0, m0, xm2, 1 pmaddubsw m0, [r5] movu xm2, [r0 + r1 * 2] ; m2 = row 2 punpckhbw xm3, xm1, xm2 punpcklbw xm1, xm2 vinserti128 m1, m1, xm3, 1 pmaddubsw m1, [r5] movu xm3, [r0 + r4] ; m3 = row 3 punpckhbw xm4, xm2, xm3 punpcklbw xm2, xm3 vinserti128 m2, m2, xm4, 1 pmaddubsw m4, m2, [r5 + 1 * mmsize] paddw m0, m4 pmaddubsw m2, [r5] lea r0, [r0 + r1 * 4] movu xm4, [r0] ; m4 = row 4 punpckhbw xm5, xm3, xm4 punpcklbw xm3, xm4 vinserti128 m3, m3, xm5, 1 pmaddubsw m5, m3, [r5 + 1 * mmsize] paddw m1, m5 pmaddubsw m3, [r5] movu xm5, [r0 + r1] ; m5 = row 5 punpckhbw xm6, xm4, xm5 punpcklbw xm4, xm5 vinserti128 m4, m4, xm6, 1 pmaddubsw m6, m4, [r5 + 2 * mmsize] paddw m0, m6 pmaddubsw m6, m4, [r5 + 1 * mmsize] paddw m2, m6 pmaddubsw m4, [r5] movu xm6, [r0 + r1 * 2] ; m6 = row 6 punpckhbw xm7, xm5, xm6 punpcklbw xm5, xm6 vinserti128 m5, m5, xm7, 1 pmaddubsw m7, m5, [r5 + 2 * mmsize] paddw m1, m7 pmaddubsw m7, m5, [r5 + 1 * mmsize] paddw m3, m7 pmaddubsw m5, [r5] movu xm7, [r0 + r4] ; m7 = row 7 punpckhbw xm8, xm6, xm7 punpcklbw xm6, xm7 vinserti128 m6, m6, xm8, 1 pmaddubsw m8, m6, [r5 + 3 * mmsize] paddw m0, m8 pmaddubsw m8, m6, [r5 + 2 * mmsize] paddw m2, m8 pmaddubsw m8, m6, [r5 + 1 * mmsize] paddw m4, m8 pmaddubsw m6, [r5] lea r0, [r0 + r1 * 4] movu xm8, [r0] ; m8 = row 8 punpckhbw xm9, xm7, xm8 punpcklbw xm7, xm8 vinserti128 m7, m7, xm9, 1 pmaddubsw m9, m7, [r5 + 3 * mmsize] paddw m1, m9 pmaddubsw m9, m7, [r5 + 2 * mmsize] paddw m3, m9 pmaddubsw m9, m7, [r5 + 1 * mmsize] paddw m5, m9 pmaddubsw m7, [r5] movu xm9, [r0 + r1] ; m9 = row 9 punpckhbw xm10, xm8, xm9 punpcklbw xm8, xm9 vinserti128 m8, m8, xm10, 1 pmaddubsw m10, m8, [r5 + 3 * mmsize] paddw m2, m10 pmaddubsw m10, m8, [r5 + 2 * mmsize] paddw m4, m10 pmaddubsw m10, m8, [r5 + 1 * mmsize] paddw m6, m10 pmaddubsw m8, [r5] movu xm10, [r0 + r1 * 2] ; m10 = row 10 punpckhbw xm11, xm9, xm10 punpcklbw xm9, xm10 vinserti128 m9, m9, xm11, 1 pmaddubsw m11, m9, [r5 + 3 * mmsize] paddw m3, m11 pmaddubsw m11, m9, [r5 + 2 * mmsize] paddw m5, m11 pmaddubsw m11, m9, [r5 + 1 * mmsize] paddw m7, m11 pmaddubsw m9, [r5] movu xm11, [r0 + r4] ; m11 = row 11 punpckhbw xm12, xm10, xm11 punpcklbw xm10, xm11 
vinserti128 m10, m10, xm12, 1 pmaddubsw m12, m10, [r5 + 3 * mmsize] paddw m4, m12 pmaddubsw m12, m10, [r5 + 2 * mmsize] paddw m6, m12 pmaddubsw m12, m10, [r5 + 1 * mmsize] paddw m8, m12 pmaddubsw m10, [r5] lea r0, [r0 + r1 * 4] movu xm12, [r0] ; m12 = row 12 punpckhbw xm13, xm11, xm12 punpcklbw xm11, xm12 vinserti128 m11, m11, xm13, 1 pmaddubsw m13, m11, [r5 + 3 * mmsize] paddw m5, m13 pmaddubsw m13, m11, [r5 + 2 * mmsize] paddw m7, m13 pmaddubsw m13, m11, [r5 + 1 * mmsize] paddw m9, m13 pmaddubsw m11, [r5] %ifidn %1,pp pmulhrsw m0, m14 ; m0 = word: row 0 pmulhrsw m1, m14 ; m1 = word: row 1 pmulhrsw m2, m14 ; m2 = word: row 2 pmulhrsw m3, m14 ; m3 = word: row 3 pmulhrsw m4, m14 ; m4 = word: row 4 pmulhrsw m5, m14 ; m5 = word: row 5 packuswb m0, m1 packuswb m2, m3 packuswb m4, m5 vpermq m0, m0, 11011000b vpermq m2, m2, 11011000b vpermq m4, m4, 11011000b vextracti128 xm1, m0, 1 vextracti128 xm3, m2, 1 vextracti128 xm5, m4, 1 movu [r2], xm0 movu [r2 + r3], xm1 movu [r2 + r3 * 2], xm2 movu [r2 + r6], xm3 lea r2, [r2 + r3 * 4] movu [r2], xm4 movu [r2 + r3], xm5 %else psubw m0, m14 ; m0 = word: row 0 psubw m1, m14 ; m1 = word: row 1 psubw m2, m14 ; m2 = word: row 2 psubw m3, m14 ; m3 = word: row 3 psubw m4, m14 ; m4 = word: row 4 psubw m5, m14 ; m5 = word: row 5 movu [r2], m0 movu [r2 + r3], m1 movu [r2 + r3 * 2], m2 movu [r2 + r6], m3 lea r2, [r2 + r3 * 4] movu [r2], m4 movu [r2 + r3], m5 %endif movu xm13, [r0 + r1] ; m13 = row 13 punpckhbw xm0, xm12, xm13 punpcklbw xm12, xm13 vinserti128 m12, m12, xm0, 1 pmaddubsw m0, m12, [r5 + 3 * mmsize] paddw m6, m0 pmaddubsw m0, m12, [r5 + 2 * mmsize] paddw m8, m0 pmaddubsw m0, m12, [r5 + 1 * mmsize] paddw m10, m0 pmaddubsw m12, [r5] movu xm0, [r0 + r1 * 2] ; m0 = row 14 punpckhbw xm1, xm13, xm0 punpcklbw xm13, xm0 vinserti128 m13, m13, xm1, 1 pmaddubsw m1, m13, [r5 + 3 * mmsize] paddw m7, m1 pmaddubsw m1, m13, [r5 + 2 * mmsize] paddw m9, m1 pmaddubsw m1, m13, [r5 + 1 * mmsize] paddw m11, m1 pmaddubsw m13, [r5] %ifidn %1,pp pmulhrsw m6, m14 ; m6 = word: row 6 pmulhrsw m7, m14 ; m7 = word: row 7 packuswb m6, m7 vpermq m6, m6, 11011000b vextracti128 xm7, m6, 1 movu [r2 + r3 * 2], xm6 movu [r2 + r6], xm7 %else psubw m6, m14 ; m6 = word: row 6 psubw m7, m14 ; m7 = word: row 7 movu [r2 + r3 * 2], m6 movu [r2 + r6], m7 %endif lea r2, [r2 + r3 * 4] movu xm1, [r0 + r4] ; m1 = row 15 punpckhbw xm2, xm0, xm1 punpcklbw xm0, xm1 vinserti128 m0, m0, xm2, 1 pmaddubsw m2, m0, [r5 + 3 * mmsize] paddw m8, m2 pmaddubsw m2, m0, [r5 + 2 * mmsize] paddw m10, m2 pmaddubsw m2, m0, [r5 + 1 * mmsize] paddw m12, m2 pmaddubsw m0, [r5] lea r0, [r0 + r1 * 4] movu xm2, [r0] ; m2 = row 16 punpckhbw xm3, xm1, xm2 punpcklbw xm1, xm2 vinserti128 m1, m1, xm3, 1 pmaddubsw m3, m1, [r5 + 3 * mmsize] paddw m9, m3 pmaddubsw m3, m1, [r5 + 2 * mmsize] paddw m11, m3 pmaddubsw m3, m1, [r5 + 1 * mmsize] paddw m13, m3 pmaddubsw m1, [r5] movu xm3, [r0 + r1] ; m3 = row 17 punpckhbw xm4, xm2, xm3 punpcklbw xm2, xm3 vinserti128 m2, m2, xm4, 1 pmaddubsw m4, m2, [r5 + 3 * mmsize] paddw m10, m4 pmaddubsw m4, m2, [r5 + 2 * mmsize] paddw m12, m4 pmaddubsw m2, [r5 + 1 * mmsize] paddw m0, m2 movu xm4, [r0 + r1 * 2] ; m4 = row 18 punpckhbw xm5, xm3, xm4 punpcklbw xm3, xm4 vinserti128 m3, m3, xm5, 1 pmaddubsw m5, m3, [r5 + 3 * mmsize] paddw m11, m5 pmaddubsw m5, m3, [r5 + 2 * mmsize] paddw m13, m5 pmaddubsw m3, [r5 + 1 * mmsize] paddw m1, m3 movu xm5, [r0 + r4] ; m5 = row 19 punpckhbw xm6, xm4, xm5 punpcklbw xm4, xm5 vinserti128 m4, m4, xm6, 1 pmaddubsw m6, m4, [r5 + 3 * mmsize] paddw m12, m6 pmaddubsw m4, [r5 + 
2 * mmsize] paddw m0, m4 lea r0, [r0 + r1 * 4] movu xm6, [r0] ; m6 = row 20 punpckhbw xm7, xm5, xm6 punpcklbw xm5, xm6 vinserti128 m5, m5, xm7, 1 pmaddubsw m7, m5, [r5 + 3 * mmsize] paddw m13, m7 pmaddubsw m5, [r5 + 2 * mmsize] paddw m1, m5 movu xm7, [r0 + r1] ; m7 = row 21 punpckhbw xm2, xm6, xm7 punpcklbw xm6, xm7 vinserti128 m6, m6, xm2, 1 pmaddubsw m6, [r5 + 3 * mmsize] paddw m0, m6 movu xm2, [r0 + r1 * 2] ; m2 = row 22 punpckhbw xm3, xm7, xm2 punpcklbw xm7, xm2 vinserti128 m7, m7, xm3, 1 pmaddubsw m7, [r5 + 3 * mmsize] paddw m1, m7 %ifidn %1,pp pmulhrsw m8, m14 ; m8 = word: row 8 pmulhrsw m9, m14 ; m9 = word: row 9 pmulhrsw m10, m14 ; m10 = word: row 10 pmulhrsw m11, m14 ; m11 = word: row 11 pmulhrsw m12, m14 ; m12 = word: row 12 pmulhrsw m13, m14 ; m13 = word: row 13 pmulhrsw m0, m14 ; m0 = word: row 14 pmulhrsw m1, m14 ; m1 = word: row 15 packuswb m8, m9 packuswb m10, m11 packuswb m12, m13 packuswb m0, m1 vpermq m8, m8, 11011000b vpermq m10, m10, 11011000b vpermq m12, m12, 11011000b vpermq m0, m0, 11011000b vextracti128 xm9, m8, 1 vextracti128 xm11, m10, 1 vextracti128 xm13, m12, 1 vextracti128 xm1, m0, 1 movu [r2], xm8 movu [r2 + r3], xm9 movu [r2 + r3 * 2], xm10 movu [r2 + r6], xm11 lea r2, [r2 + r3 * 4] movu [r2], xm12 movu [r2 + r3], xm13 movu [r2 + r3 * 2], xm0 movu [r2 + r6], xm1 %else psubw m8, m14 ; m8 = word: row 8 psubw m9, m14 ; m9 = word: row 9 psubw m10, m14 ; m10 = word: row 10 psubw m11, m14 ; m11 = word: row 11 psubw m12, m14 ; m12 = word: row 12 psubw m13, m14 ; m13 = word: row 13 psubw m0, m14 ; m0 = word: row 14 psubw m1, m14 ; m1 = word: row 15 movu [r2], m8 movu [r2 + r3], m9 movu [r2 + r3 * 2], m10 movu [r2 + r6], m11 lea r2, [r2 + r3 * 4] movu [r2], m12 movu [r2 + r3], m13 movu [r2 + r3 * 2], m0 movu [r2 + r6], m1 %endif RET %endif %endmacro FILTER_VER_LUMA_AVX2_16x16 pp FILTER_VER_LUMA_AVX2_16x16 ps %macro FILTER_VER_LUMA_AVX2_16x12 1 INIT_YMM avx2 %if ARCH_X86_64 == 1 cglobal interp_8tap_vert_%1_16x12, 4, 7, 15 mov r4d, r4m shl r4d, 7 %ifdef PIC lea r5, [tab_LumaCoeffVer_32] add r5, r4 %else lea r5, [tab_LumaCoeffVer_32 + r4] %endif lea r4, [r1 * 3] sub r0, r4 %ifidn %1,pp mova m14, [pw_512] %else add r3d, r3d vbroadcasti128 m14, [pw_2000] %endif lea r6, [r3 * 3] movu xm0, [r0] ; m0 = row 0 movu xm1, [r0 + r1] ; m1 = row 1 punpckhbw xm2, xm0, xm1 punpcklbw xm0, xm1 vinserti128 m0, m0, xm2, 1 pmaddubsw m0, [r5] movu xm2, [r0 + r1 * 2] ; m2 = row 2 punpckhbw xm3, xm1, xm2 punpcklbw xm1, xm2 vinserti128 m1, m1, xm3, 1 pmaddubsw m1, [r5] movu xm3, [r0 + r4] ; m3 = row 3 punpckhbw xm4, xm2, xm3 punpcklbw xm2, xm3 vinserti128 m2, m2, xm4, 1 pmaddubsw m4, m2, [r5 + 1 * mmsize] paddw m0, m4 pmaddubsw m2, [r5] lea r0, [r0 + r1 * 4] movu xm4, [r0] ; m4 = row 4 punpckhbw xm5, xm3, xm4 punpcklbw xm3, xm4 vinserti128 m3, m3, xm5, 1 pmaddubsw m5, m3, [r5 + 1 * mmsize] paddw m1, m5 pmaddubsw m3, [r5] movu xm5, [r0 + r1] ; m5 = row 5 punpckhbw xm6, xm4, xm5 punpcklbw xm4, xm5 vinserti128 m4, m4, xm6, 1 pmaddubsw m6, m4, [r5 + 2 * mmsize] paddw m0, m6 pmaddubsw m6, m4, [r5 + 1 * mmsize] paddw m2, m6 pmaddubsw m4, [r5] movu xm6, [r0 + r1 * 2] ; m6 = row 6 punpckhbw xm7, xm5, xm6 punpcklbw xm5, xm6 vinserti128 m5, m5, xm7, 1 pmaddubsw m7, m5, [r5 + 2 * mmsize] paddw m1, m7 pmaddubsw m7, m5, [r5 + 1 * mmsize] paddw m3, m7 pmaddubsw m5, [r5] movu xm7, [r0 + r4] ; m7 = row 7 punpckhbw xm8, xm6, xm7 punpcklbw xm6, xm7 vinserti128 m6, m6, xm8, 1 pmaddubsw m8, m6, [r5 + 3 * mmsize] paddw m0, m8 pmaddubsw m8, m6, [r5 + 2 * mmsize] paddw m2, m8 pmaddubsw m8, m6, [r5 + 1 * mmsize] 
paddw m4, m8 pmaddubsw m6, [r5] lea r0, [r0 + r1 * 4] movu xm8, [r0] ; m8 = row 8 punpckhbw xm9, xm7, xm8 punpcklbw xm7, xm8 vinserti128 m7, m7, xm9, 1 pmaddubsw m9, m7, [r5 + 3 * mmsize] paddw m1, m9 pmaddubsw m9, m7, [r5 + 2 * mmsize] paddw m3, m9 pmaddubsw m9, m7, [r5 + 1 * mmsize] paddw m5, m9 pmaddubsw m7, [r5] movu xm9, [r0 + r1] ; m9 = row 9 punpckhbw xm10, xm8, xm9 punpcklbw xm8, xm9 vinserti128 m8, m8, xm10, 1 pmaddubsw m10, m8, [r5 + 3 * mmsize] paddw m2, m10 pmaddubsw m10, m8, [r5 + 2 * mmsize] paddw m4, m10 pmaddubsw m10, m8, [r5 + 1 * mmsize] paddw m6, m10 pmaddubsw m8, [r5] movu xm10, [r0 + r1 * 2] ; m10 = row 10 punpckhbw xm11, xm9, xm10 punpcklbw xm9, xm10 vinserti128 m9, m9, xm11, 1 pmaddubsw m11, m9, [r5 + 3 * mmsize] paddw m3, m11 pmaddubsw m11, m9, [r5 + 2 * mmsize] paddw m5, m11 pmaddubsw m11, m9, [r5 + 1 * mmsize] paddw m7, m11 pmaddubsw m9, [r5] movu xm11, [r0 + r4] ; m11 = row 11 punpckhbw xm12, xm10, xm11 punpcklbw xm10, xm11 vinserti128 m10, m10, xm12, 1 pmaddubsw m12, m10, [r5 + 3 * mmsize] paddw m4, m12 pmaddubsw m12, m10, [r5 + 2 * mmsize] paddw m6, m12 pmaddubsw m12, m10, [r5 + 1 * mmsize] paddw m8, m12 pmaddubsw m10, [r5] lea r0, [r0 + r1 * 4] movu xm12, [r0] ; m12 = row 12 punpckhbw xm13, xm11, xm12 punpcklbw xm11, xm12 vinserti128 m11, m11, xm13, 1 pmaddubsw m13, m11, [r5 + 3 * mmsize] paddw m5, m13 pmaddubsw m13, m11, [r5 + 2 * mmsize] paddw m7, m13 pmaddubsw m13, m11, [r5 + 1 * mmsize] paddw m9, m13 pmaddubsw m11, [r5] %ifidn %1,pp pmulhrsw m0, m14 ; m0 = word: row 0 pmulhrsw m1, m14 ; m1 = word: row 1 pmulhrsw m2, m14 ; m2 = word: row 2 pmulhrsw m3, m14 ; m3 = word: row 3 pmulhrsw m4, m14 ; m4 = word: row 4 pmulhrsw m5, m14 ; m5 = word: row 5 packuswb m0, m1 packuswb m2, m3 packuswb m4, m5 vpermq m0, m0, 11011000b vpermq m2, m2, 11011000b vpermq m4, m4, 11011000b vextracti128 xm1, m0, 1 vextracti128 xm3, m2, 1 vextracti128 xm5, m4, 1 movu [r2], xm0 movu [r2 + r3], xm1 movu [r2 + r3 * 2], xm2 movu [r2 + r6], xm3 lea r2, [r2 + r3 * 4] movu [r2], xm4 movu [r2 + r3], xm5 %else psubw m0, m14 ; m0 = word: row 0 psubw m1, m14 ; m1 = word: row 1 psubw m2, m14 ; m2 = word: row 2 psubw m3, m14 ; m3 = word: row 3 psubw m4, m14 ; m4 = word: row 4 psubw m5, m14 ; m5 = word: row 5 movu [r2], m0 movu [r2 + r3], m1 movu [r2 + r3 * 2], m2 movu [r2 + r6], m3 lea r2, [r2 + r3 * 4] movu [r2], m4 movu [r2 + r3], m5 %endif movu xm13, [r0 + r1] ; m13 = row 13 punpckhbw xm0, xm12, xm13 punpcklbw xm12, xm13 vinserti128 m12, m12, xm0, 1 pmaddubsw m0, m12, [r5 + 3 * mmsize] paddw m6, m0 pmaddubsw m0, m12, [r5 + 2 * mmsize] paddw m8, m0 pmaddubsw m0, m12, [r5 + 1 * mmsize] paddw m10, m0 movu xm0, [r0 + r1 * 2] ; m0 = row 14 punpckhbw xm1, xm13, xm0 punpcklbw xm13, xm0 vinserti128 m13, m13, xm1, 1 pmaddubsw m1, m13, [r5 + 3 * mmsize] paddw m7, m1 pmaddubsw m1, m13, [r5 + 2 * mmsize] paddw m9, m1 pmaddubsw m1, m13, [r5 + 1 * mmsize] paddw m11, m1 %ifidn %1,pp pmulhrsw m6, m14 ; m6 = word: row 6 pmulhrsw m7, m14 ; m7 = word: row 7 packuswb m6, m7 vpermq m6, m6, 11011000b vextracti128 xm7, m6, 1 movu [r2 + r3 * 2], xm6 movu [r2 + r6], xm7 %else psubw m6, m14 ; m6 = word: row 6 psubw m7, m14 ; m7 = word: row 7 movu [r2 + r3 * 2], m6 movu [r2 + r6], m7 %endif lea r2, [r2 + r3 * 4] movu xm1, [r0 + r4] ; m1 = row 15 punpckhbw xm2, xm0, xm1 punpcklbw xm0, xm1 vinserti128 m0, m0, xm2, 1 pmaddubsw m2, m0, [r5 + 3 * mmsize] paddw m8, m2 pmaddubsw m2, m0, [r5 + 2 * mmsize] paddw m10, m2 lea r0, [r0 + r1 * 4] movu xm2, [r0] ; m2 = row 16 punpckhbw xm3, xm1, xm2 punpcklbw xm1, xm2 vinserti128 m1, 
m1, xm3, 1 pmaddubsw m3, m1, [r5 + 3 * mmsize] paddw m9, m3 pmaddubsw m3, m1, [r5 + 2 * mmsize] paddw m11, m3 movu xm3, [r0 + r1] ; m3 = row 17 punpckhbw xm4, xm2, xm3 punpcklbw xm2, xm3 vinserti128 m2, m2, xm4, 1 pmaddubsw m4, m2, [r5 + 3 * mmsize] paddw m10, m4 movu xm4, [r0 + r1 * 2] ; m4 = row 18 punpckhbw xm5, xm3, xm4 punpcklbw xm3, xm4 vinserti128 m3, m3, xm5, 1 pmaddubsw m5, m3, [r5 + 3 * mmsize] paddw m11, m5 %ifidn %1,pp pmulhrsw m8, m14 ; m8 = word: row 8 pmulhrsw m9, m14 ; m9 = word: row 9 pmulhrsw m10, m14 ; m10 = word: row 10 pmulhrsw m11, m14 ; m11 = word: row 11 packuswb m8, m9 packuswb m10, m11 vpermq m8, m8, 11011000b vpermq m10, m10, 11011000b vextracti128 xm9, m8, 1 vextracti128 xm11, m10, 1 movu [r2], xm8 movu [r2 + r3], xm9 movu [r2 + r3 * 2], xm10 movu [r2 + r6], xm11 %else psubw m8, m14 ; m8 = word: row 8 psubw m9, m14 ; m9 = word: row 9 psubw m10, m14 ; m10 = word: row 10 psubw m11, m14 ; m11 = word: row 11 movu [r2], m8 movu [r2 + r3], m9 movu [r2 + r3 * 2], m10 movu [r2 + r6], m11 %endif RET %endif %endmacro FILTER_VER_LUMA_AVX2_16x12 pp FILTER_VER_LUMA_AVX2_16x12 ps %macro FILTER_VER_LUMA_AVX2_16x8 1 INIT_YMM avx2 %if ARCH_X86_64 == 1 cglobal interp_8tap_vert_%1_16x8, 4, 6, 15 mov r4d, r4m shl r4d, 7 %ifdef PIC lea r5, [tab_LumaCoeffVer_32] add r5, r4 %else lea r5, [tab_LumaCoeffVer_32 + r4] %endif lea r4, [r1 * 3] sub r0, r4 %ifidn %1,pp mova m14, [pw_512] %else add r3d, r3d vbroadcasti128 m14, [pw_2000] %endif movu xm0, [r0] ; m0 = row 0 movu xm1, [r0 + r1] ; m1 = row 1 punpckhbw xm2, xm0, xm1 punpcklbw xm0, xm1 vinserti128 m0, m0, xm2, 1 pmaddubsw m0, [r5] movu xm2, [r0 + r1 * 2] ; m2 = row 2 punpckhbw xm3, xm1, xm2 punpcklbw xm1, xm2 vinserti128 m1, m1, xm3, 1 pmaddubsw m1, [r5] movu xm3, [r0 + r4] ; m3 = row 3 punpckhbw xm4, xm2, xm3 punpcklbw xm2, xm3 vinserti128 m2, m2, xm4, 1 pmaddubsw m4, m2, [r5 + 1 * mmsize] paddw m0, m4 pmaddubsw m2, [r5] lea r0, [r0 + r1 * 4] movu xm4, [r0] ; m4 = row 4 punpckhbw xm5, xm3, xm4 punpcklbw xm3, xm4 vinserti128 m3, m3, xm5, 1 pmaddubsw m5, m3, [r5 + 1 * mmsize] paddw m1, m5 pmaddubsw m3, [r5] movu xm5, [r0 + r1] ; m5 = row 5 punpckhbw xm6, xm4, xm5 punpcklbw xm4, xm5 vinserti128 m4, m4, xm6, 1 pmaddubsw m6, m4, [r5 + 2 * mmsize] paddw m0, m6 pmaddubsw m6, m4, [r5 + 1 * mmsize] paddw m2, m6 pmaddubsw m4, [r5] movu xm6, [r0 + r1 * 2] ; m6 = row 6 punpckhbw xm7, xm5, xm6 punpcklbw xm5, xm6 vinserti128 m5, m5, xm7, 1 pmaddubsw m7, m5, [r5 + 2 * mmsize] paddw m1, m7 pmaddubsw m7, m5, [r5 + 1 * mmsize] paddw m3, m7 pmaddubsw m5, [r5] movu xm7, [r0 + r4] ; m7 = row 7 punpckhbw xm8, xm6, xm7 punpcklbw xm6, xm7 vinserti128 m6, m6, xm8, 1 pmaddubsw m8, m6, [r5 + 3 * mmsize] paddw m0, m8 pmaddubsw m8, m6, [r5 + 2 * mmsize] paddw m2, m8 pmaddubsw m8, m6, [r5 + 1 * mmsize] paddw m4, m8 pmaddubsw m6, [r5] lea r0, [r0 + r1 * 4] movu xm8, [r0] ; m8 = row 8 punpckhbw xm9, xm7, xm8 punpcklbw xm7, xm8 vinserti128 m7, m7, xm9, 1 pmaddubsw m9, m7, [r5 + 3 * mmsize] paddw m1, m9 pmaddubsw m9, m7, [r5 + 2 * mmsize] paddw m3, m9 pmaddubsw m9, m7, [r5 + 1 * mmsize] paddw m5, m9 pmaddubsw m7, [r5] movu xm9, [r0 + r1] ; m9 = row 9 punpckhbw xm10, xm8, xm9 punpcklbw xm8, xm9 vinserti128 m8, m8, xm10, 1 pmaddubsw m10, m8, [r5 + 3 * mmsize] paddw m2, m10 pmaddubsw m10, m8, [r5 + 2 * mmsize] paddw m4, m10 pmaddubsw m10, m8, [r5 + 1 * mmsize] paddw m6, m10 movu xm10, [r0 + r1 * 2] ; m10 = row 10 punpckhbw xm11, xm9, xm10 punpcklbw xm9, xm10 vinserti128 m9, m9, xm11, 1 pmaddubsw m11, m9, [r5 + 3 * mmsize] paddw m3, m11 pmaddubsw m11, m9, [r5 + 2 * 
mmsize] paddw m5, m11 pmaddubsw m11, m9, [r5 + 1 * mmsize] paddw m7, m11 movu xm11, [r0 + r4] ; m11 = row 11 punpckhbw xm12, xm10, xm11 punpcklbw xm10, xm11 vinserti128 m10, m10, xm12, 1 pmaddubsw m12, m10, [r5 + 3 * mmsize] paddw m4, m12 pmaddubsw m12, m10, [r5 + 2 * mmsize] paddw m6, m12 lea r0, [r0 + r1 * 4] movu xm12, [r0] ; m12 = row 12 punpckhbw xm13, xm11, xm12 punpcklbw xm11, xm12 vinserti128 m11, m11, xm13, 1 pmaddubsw m13, m11, [r5 + 3 * mmsize] paddw m5, m13 pmaddubsw m13, m11, [r5 + 2 * mmsize] paddw m7, m13 lea r4, [r3 * 3] %ifidn %1,pp pmulhrsw m0, m14 ; m0 = word: row 0 pmulhrsw m1, m14 ; m1 = word: row 1 pmulhrsw m2, m14 ; m2 = word: row 2 pmulhrsw m3, m14 ; m3 = word: row 3 pmulhrsw m4, m14 ; m4 = word: row 4 pmulhrsw m5, m14 ; m5 = word: row 5 packuswb m0, m1 packuswb m2, m3 packuswb m4, m5 vpermq m0, m0, 11011000b vpermq m2, m2, 11011000b vpermq m4, m4, 11011000b vextracti128 xm1, m0, 1 vextracti128 xm3, m2, 1 vextracti128 xm5, m4, 1 movu [r2], xm0 movu [r2 + r3], xm1 movu [r2 + r3 * 2], xm2 movu [r2 + r4], xm3 lea r2, [r2 + r3 * 4] movu [r2], xm4 movu [r2 + r3], xm5 %else psubw m0, m14 ; m0 = word: row 0 psubw m1, m14 ; m1 = word: row 1 psubw m2, m14 ; m2 = word: row 2 psubw m3, m14 ; m3 = word: row 3 psubw m4, m14 ; m4 = word: row 4 psubw m5, m14 ; m5 = word: row 5 movu [r2], m0 movu [r2 + r3], m1 movu [r2 + r3 * 2], m2 movu [r2 + r4], m3 lea r2, [r2 + r3 * 4] movu [r2], m4 movu [r2 + r3], m5 %endif movu xm13, [r0 + r1] ; m13 = row 13 punpckhbw xm0, xm12, xm13 punpcklbw xm12, xm13 vinserti128 m12, m12, xm0, 1 pmaddubsw m0, m12, [r5 + 3 * mmsize] paddw m6, m0 movu xm0, [r0 + r1 * 2] ; m0 = row 14 punpckhbw xm1, xm13, xm0 punpcklbw xm13, xm0 vinserti128 m13, m13, xm1, 1 pmaddubsw m1, m13, [r5 + 3 * mmsize] paddw m7, m1 %ifidn %1,pp pmulhrsw m6, m14 ; m6 = word: row 6 pmulhrsw m7, m14 ; m7 = word: row 7 packuswb m6, m7 vpermq m6, m6, 11011000b vextracti128 xm7, m6, 1 movu [r2 + r3 * 2], xm6 movu [r2 + r4], xm7 %else psubw m6, m14 ; m6 = word: row 6 psubw m7, m14 ; m7 = word: row 7 movu [r2 + r3 * 2], m6 movu [r2 + r4], m7 %endif RET %endif %endmacro FILTER_VER_LUMA_AVX2_16x8 pp FILTER_VER_LUMA_AVX2_16x8 ps %macro FILTER_VER_LUMA_AVX2_16x4 1 INIT_YMM avx2 %if ARCH_X86_64 == 1 cglobal interp_8tap_vert_%1_16x4, 4, 6, 13 mov r4d, r4m shl r4d, 7 %ifdef PIC lea r5, [tab_LumaCoeffVer_32] add r5, r4 %else lea r5, [tab_LumaCoeffVer_32 + r4] %endif lea r4, [r1 * 3] sub r0, r4 %ifidn %1,pp mova m12, [pw_512] %else add r3d, r3d vbroadcasti128 m12, [pw_2000] %endif movu xm0, [r0] ; m0 = row 0 movu xm1, [r0 + r1] ; m1 = row 1 punpckhbw xm2, xm0, xm1 punpcklbw xm0, xm1 vinserti128 m0, m0, xm2, 1 pmaddubsw m0, [r5] movu xm2, [r0 + r1 * 2] ; m2 = row 2 punpckhbw xm3, xm1, xm2 punpcklbw xm1, xm2 vinserti128 m1, m1, xm3, 1 pmaddubsw m1, [r5] movu xm3, [r0 + r4] ; m3 = row 3 punpckhbw xm4, xm2, xm3 punpcklbw xm2, xm3 vinserti128 m2, m2, xm4, 1 pmaddubsw m4, m2, [r5 + 1 * mmsize] paddw m0, m4 pmaddubsw m2, [r5] lea r0, [r0 + r1 * 4] movu xm4, [r0] ; m4 = row 4 punpckhbw xm5, xm3, xm4 punpcklbw xm3, xm4 vinserti128 m3, m3, xm5, 1 pmaddubsw m5, m3, [r5 + 1 * mmsize] paddw m1, m5 pmaddubsw m3, [r5] movu xm5, [r0 + r1] ; m5 = row 5 punpckhbw xm6, xm4, xm5 punpcklbw xm4, xm5 vinserti128 m4, m4, xm6, 1 pmaddubsw m6, m4, [r5 + 2 * mmsize] paddw m0, m6 pmaddubsw m6, m4, [r5 + 1 * mmsize] paddw m2, m6 movu xm6, [r0 + r1 * 2] ; m6 = row 6 punpckhbw xm7, xm5, xm6 punpcklbw xm5, xm6 vinserti128 m5, m5, xm7, 1 pmaddubsw m7, m5, [r5 + 2 * mmsize] paddw m1, m7 pmaddubsw m7, m5, [r5 + 1 * mmsize] paddw m3, 
m7 movu xm7, [r0 + r4] ; m7 = row 7 punpckhbw xm8, xm6, xm7 punpcklbw xm6, xm7 vinserti128 m6, m6, xm8, 1 pmaddubsw m8, m6, [r5 + 3 * mmsize] paddw m0, m8 pmaddubsw m8, m6, [r5 + 2 * mmsize] paddw m2, m8 lea r0, [r0 + r1 * 4] movu xm8, [r0] ; m8 = row 8 punpckhbw xm9, xm7, xm8 punpcklbw xm7, xm8 vinserti128 m7, m7, xm9, 1 pmaddubsw m9, m7, [r5 + 3 * mmsize] paddw m1, m9 pmaddubsw m9, m7, [r5 + 2 * mmsize] paddw m3, m9 movu xm9, [r0 + r1] ; m9 = row 9 punpckhbw xm10, xm8, xm9 punpcklbw xm8, xm9 vinserti128 m8, m8, xm10, 1 pmaddubsw m10, m8, [r5 + 3 * mmsize] paddw m2, m10 movu xm10, [r0 + r1 * 2] ; m10 = row 10 punpckhbw xm11, xm9, xm10 punpcklbw xm9, xm10 vinserti128 m9, m9, xm11, 1 pmaddubsw m11, m9, [r5 + 3 * mmsize] paddw m3, m11 %ifidn %1,pp pmulhrsw m0, m12 ; m0 = word: row 0 pmulhrsw m1, m12 ; m1 = word: row 1 pmulhrsw m2, m12 ; m2 = word: row 2 pmulhrsw m3, m12 ; m3 = word: row 3 packuswb m0, m1 packuswb m2, m3 vpermq m0, m0, 11011000b vpermq m2, m2, 11011000b vextracti128 xm1, m0, 1 vextracti128 xm3, m2, 1 movu [r2], xm0 movu [r2 + r3], xm1 movu [r2 + r3 * 2], xm2 lea r4, [r3 * 3] movu [r2 + r4], xm3 %else psubw m0, m12 ; m0 = word: row 0 psubw m1, m12 ; m1 = word: row 1 psubw m2, m12 ; m2 = word: row 2 psubw m3, m12 ; m3 = word: row 3 movu [r2], m0 movu [r2 + r3], m1 movu [r2 + r3 * 2], m2 lea r4, [r3 * 3] movu [r2 + r4], m3 %endif RET %endif %endmacro FILTER_VER_LUMA_AVX2_16x4 pp FILTER_VER_LUMA_AVX2_16x4 ps %macro FILTER_VER_LUMA_AVX2_16xN 3 INIT_YMM avx2 %if ARCH_X86_64 == 1 cglobal interp_8tap_vert_%3_%1x%2, 4, 9, 15 mov r4d, r4m shl r4d, 7 %ifdef PIC lea r5, [tab_LumaCoeffVer_32] add r5, r4 %else lea r5, [tab_LumaCoeffVer_32 + r4] %endif lea r4, [r1 * 3] sub r0, r4 %ifidn %3,ps add r3d, r3d vbroadcasti128 m14, [pw_2000] %else mova m14, [pw_512] %endif lea r6, [r3 * 3] lea r7, [r1 * 4] mov r8d, %2 / 16 .loop: movu xm0, [r0] ; m0 = row 0 movu xm1, [r0 + r1] ; m1 = row 1 punpckhbw xm2, xm0, xm1 punpcklbw xm0, xm1 vinserti128 m0, m0, xm2, 1 pmaddubsw m0, [r5] movu xm2, [r0 + r1 * 2] ; m2 = row 2 punpckhbw xm3, xm1, xm2 punpcklbw xm1, xm2 vinserti128 m1, m1, xm3, 1 pmaddubsw m1, [r5] movu xm3, [r0 + r4] ; m3 = row 3 punpckhbw xm4, xm2, xm3 punpcklbw xm2, xm3 vinserti128 m2, m2, xm4, 1 pmaddubsw m4, m2, [r5 + 1 * mmsize] paddw m0, m4 pmaddubsw m2, [r5] lea r0, [r0 + r1 * 4] movu xm4, [r0] ; m4 = row 4 punpckhbw xm5, xm3, xm4 punpcklbw xm3, xm4 vinserti128 m3, m3, xm5, 1 pmaddubsw m5, m3, [r5 + 1 * mmsize] paddw m1, m5 pmaddubsw m3, [r5] movu xm5, [r0 + r1] ; m5 = row 5 punpckhbw xm6, xm4, xm5 punpcklbw xm4, xm5 vinserti128 m4, m4, xm6, 1 pmaddubsw m6, m4, [r5 + 2 * mmsize] paddw m0, m6 pmaddubsw m6, m4, [r5 + 1 * mmsize] paddw m2, m6 pmaddubsw m4, [r5] movu xm6, [r0 + r1 * 2] ; m6 = row 6 punpckhbw xm7, xm5, xm6 punpcklbw xm5, xm6 vinserti128 m5, m5, xm7, 1 pmaddubsw m7, m5, [r5 + 2 * mmsize] paddw m1, m7 pmaddubsw m7, m5, [r5 + 1 * mmsize] paddw m3, m7 pmaddubsw m5, [r5] movu xm7, [r0 + r4] ; m7 = row 7 punpckhbw xm8, xm6, xm7 punpcklbw xm6, xm7 vinserti128 m6, m6, xm8, 1 pmaddubsw m8, m6, [r5 + 3 * mmsize] paddw m0, m8 pmaddubsw m8, m6, [r5 + 2 * mmsize] paddw m2, m8 pmaddubsw m8, m6, [r5 + 1 * mmsize] paddw m4, m8 pmaddubsw m6, [r5] lea r0, [r0 + r1 * 4] movu xm8, [r0] ; m8 = row 8 punpckhbw xm9, xm7, xm8 punpcklbw xm7, xm8 vinserti128 m7, m7, xm9, 1 pmaddubsw m9, m7, [r5 + 3 * mmsize] paddw m1, m9 pmaddubsw m9, m7, [r5 + 2 * mmsize] paddw m3, m9 pmaddubsw m9, m7, [r5 + 1 * mmsize] paddw m5, m9 pmaddubsw m7, [r5] movu xm9, [r0 + r1] ; m9 = row 9 punpckhbw xm10, xm8, xm9 
punpcklbw xm8, xm9 vinserti128 m8, m8, xm10, 1 pmaddubsw m10, m8, [r5 + 3 * mmsize] paddw m2, m10 pmaddubsw m10, m8, [r5 + 2 * mmsize] paddw m4, m10 pmaddubsw m10, m8, [r5 + 1 * mmsize] paddw m6, m10 pmaddubsw m8, [r5] movu xm10, [r0 + r1 * 2] ; m10 = row 10 punpckhbw xm11, xm9, xm10 punpcklbw xm9, xm10 vinserti128 m9, m9, xm11, 1 pmaddubsw m11, m9, [r5 + 3 * mmsize] paddw m3, m11 pmaddubsw m11, m9, [r5 + 2 * mmsize] paddw m5, m11 pmaddubsw m11, m9, [r5 + 1 * mmsize] paddw m7, m11 pmaddubsw m9, [r5] movu xm11, [r0 + r4] ; m11 = row 11 punpckhbw xm12, xm10, xm11 punpcklbw xm10, xm11 vinserti128 m10, m10, xm12, 1 pmaddubsw m12, m10, [r5 + 3 * mmsize] paddw m4, m12 pmaddubsw m12, m10, [r5 + 2 * mmsize] paddw m6, m12 pmaddubsw m12, m10, [r5 + 1 * mmsize] paddw m8, m12 pmaddubsw m10, [r5] lea r0, [r0 + r1 * 4] movu xm12, [r0] ; m12 = row 12 punpckhbw xm13, xm11, xm12 punpcklbw xm11, xm12 vinserti128 m11, m11, xm13, 1 pmaddubsw m13, m11, [r5 + 3 * mmsize] paddw m5, m13 pmaddubsw m13, m11, [r5 + 2 * mmsize] paddw m7, m13 pmaddubsw m13, m11, [r5 + 1 * mmsize] paddw m9, m13 pmaddubsw m11, [r5] %ifidn %3,pp pmulhrsw m0, m14 ; m0 = word: row 0 pmulhrsw m1, m14 ; m1 = word: row 1 pmulhrsw m2, m14 ; m2 = word: row 2 pmulhrsw m3, m14 ; m3 = word: row 3 pmulhrsw m4, m14 ; m4 = word: row 4 pmulhrsw m5, m14 ; m5 = word: row 5 packuswb m0, m1 packuswb m2, m3 packuswb m4, m5 vpermq m0, m0, 11011000b vpermq m2, m2, 11011000b vpermq m4, m4, 11011000b vextracti128 xm1, m0, 1 vextracti128 xm3, m2, 1 vextracti128 xm5, m4, 1 movu [r2], xm0 movu [r2 + r3], xm1 movu [r2 + r3 * 2], xm2 movu [r2 + r6], xm3 lea r2, [r2 + r3 * 4] movu [r2], xm4 movu [r2 + r3], xm5 %else psubw m0, m14 ; m0 = word: row 0 psubw m1, m14 ; m1 = word: row 1 psubw m2, m14 ; m2 = word: row 2 psubw m3, m14 ; m3 = word: row 3 psubw m4, m14 ; m4 = word: row 4 psubw m5, m14 ; m5 = word: row 5 movu [r2], m0 movu [r2 + r3], m1 movu [r2 + r3 * 2], m2 movu [r2 + r6], m3 lea r2, [r2 + r3 * 4] movu [r2], m4 movu [r2 + r3], m5 %endif movu xm13, [r0 + r1] ; m13 = row 13 punpckhbw xm0, xm12, xm13 punpcklbw xm12, xm13 vinserti128 m12, m12, xm0, 1 pmaddubsw m0, m12, [r5 + 3 * mmsize] paddw m6, m0 pmaddubsw m0, m12, [r5 + 2 * mmsize] paddw m8, m0 pmaddubsw m0, m12, [r5 + 1 * mmsize] paddw m10, m0 pmaddubsw m12, [r5] movu xm0, [r0 + r1 * 2] ; m0 = row 14 punpckhbw xm1, xm13, xm0 punpcklbw xm13, xm0 vinserti128 m13, m13, xm1, 1 pmaddubsw m1, m13, [r5 + 3 * mmsize] paddw m7, m1 pmaddubsw m1, m13, [r5 + 2 * mmsize] paddw m9, m1 pmaddubsw m1, m13, [r5 + 1 * mmsize] paddw m11, m1 pmaddubsw m13, [r5] %ifidn %3,pp pmulhrsw m6, m14 ; m6 = word: row 6 pmulhrsw m7, m14 ; m7 = word: row 7 packuswb m6, m7 vpermq m6, m6, 11011000b vextracti128 xm7, m6, 1 movu [r2 + r3 * 2], xm6 movu [r2 + r6], xm7 %else psubw m6, m14 ; m6 = word: row 6 psubw m7, m14 ; m7 = word: row 7 movu [r2 + r3 * 2], m6 movu [r2 + r6], m7 %endif lea r2, [r2 + r3 * 4] movu xm1, [r0 + r4] ; m1 = row 15 punpckhbw xm2, xm0, xm1 punpcklbw xm0, xm1 vinserti128 m0, m0, xm2, 1 pmaddubsw m2, m0, [r5 + 3 * mmsize] paddw m8, m2 pmaddubsw m2, m0, [r5 + 2 * mmsize] paddw m10, m2 pmaddubsw m2, m0, [r5 + 1 * mmsize] paddw m12, m2 pmaddubsw m0, [r5] lea r0, [r0 + r1 * 4] movu xm2, [r0] ; m2 = row 16 punpckhbw xm3, xm1, xm2 punpcklbw xm1, xm2 vinserti128 m1, m1, xm3, 1 pmaddubsw m3, m1, [r5 + 3 * mmsize] paddw m9, m3 pmaddubsw m3, m1, [r5 + 2 * mmsize] paddw m11, m3 pmaddubsw m3, m1, [r5 + 1 * mmsize] paddw m13, m3 pmaddubsw m1, [r5] movu xm3, [r0 + r1] ; m3 = row 17 punpckhbw xm4, xm2, xm3 punpcklbw xm2, xm3 
vinserti128 m2, m2, xm4, 1 pmaddubsw m4, m2, [r5 + 3 * mmsize] paddw m10, m4 pmaddubsw m4, m2, [r5 + 2 * mmsize] paddw m12, m4 pmaddubsw m2, [r5 + 1 * mmsize] paddw m0, m2 movu xm4, [r0 + r1 * 2] ; m4 = row 18 punpckhbw xm5, xm3, xm4 punpcklbw xm3, xm4 vinserti128 m3, m3, xm5, 1 pmaddubsw m5, m3, [r5 + 3 * mmsize] paddw m11, m5 pmaddubsw m5, m3, [r5 + 2 * mmsize] paddw m13, m5 pmaddubsw m3, [r5 + 1 * mmsize] paddw m1, m3 movu xm5, [r0 + r4] ; m5 = row 19 punpckhbw xm6, xm4, xm5 punpcklbw xm4, xm5 vinserti128 m4, m4, xm6, 1 pmaddubsw m6, m4, [r5 + 3 * mmsize] paddw m12, m6 pmaddubsw m4, [r5 + 2 * mmsize] paddw m0, m4 lea r0, [r0 + r1 * 4] movu xm6, [r0] ; m6 = row 20 punpckhbw xm7, xm5, xm6 punpcklbw xm5, xm6 vinserti128 m5, m5, xm7, 1 pmaddubsw m7, m5, [r5 + 3 * mmsize] paddw m13, m7 pmaddubsw m5, [r5 + 2 * mmsize] paddw m1, m5 movu xm7, [r0 + r1] ; m7 = row 21 punpckhbw xm2, xm6, xm7 punpcklbw xm6, xm7 vinserti128 m6, m6, xm2, 1 pmaddubsw m6, [r5 + 3 * mmsize] paddw m0, m6 movu xm2, [r0 + r1 * 2] ; m2 = row 22 punpckhbw xm3, xm7, xm2 punpcklbw xm7, xm2 vinserti128 m7, m7, xm3, 1 pmaddubsw m7, [r5 + 3 * mmsize] paddw m1, m7 %ifidn %3,pp pmulhrsw m8, m14 ; m8 = word: row 8 pmulhrsw m9, m14 ; m9 = word: row 9 pmulhrsw m10, m14 ; m10 = word: row 10 pmulhrsw m11, m14 ; m11 = word: row 11 pmulhrsw m12, m14 ; m12 = word: row 12 pmulhrsw m13, m14 ; m13 = word: row 13 pmulhrsw m0, m14 ; m0 = word: row 14 pmulhrsw m1, m14 ; m1 = word: row 15 packuswb m8, m9 packuswb m10, m11 packuswb m12, m13 packuswb m0, m1 vpermq m8, m8, 11011000b vpermq m10, m10, 11011000b vpermq m12, m12, 11011000b vpermq m0, m0, 11011000b vextracti128 xm9, m8, 1 vextracti128 xm11, m10, 1 vextracti128 xm13, m12, 1 vextracti128 xm1, m0, 1 movu [r2], xm8 movu [r2 + r3], xm9 movu [r2 + r3 * 2], xm10 movu [r2 + r6], xm11 lea r2, [r2 + r3 * 4] movu [r2], xm12 movu [r2 + r3], xm13 movu [r2 + r3 * 2], xm0 movu [r2 + r6], xm1 %else psubw m8, m14 ; m8 = word: row 8 psubw m9, m14 ; m9 = word: row 9 psubw m10, m14 ; m10 = word: row 10 psubw m11, m14 ; m11 = word: row 11 psubw m12, m14 ; m12 = word: row 12 psubw m13, m14 ; m13 = word: row 13 psubw m0, m14 ; m0 = word: row 14 psubw m1, m14 ; m1 = word: row 15 movu [r2], m8 movu [r2 + r3], m9 movu [r2 + r3 * 2], m10 movu [r2 + r6], m11 lea r2, [r2 + r3 * 4] movu [r2], m12 movu [r2 + r3], m13 movu [r2 + r3 * 2], m0 movu [r2 + r6], m1 %endif lea r2, [r2 + r3 * 4] sub r0, r7 dec r8d jnz .loop RET %endif %endmacro FILTER_VER_LUMA_AVX2_16xN 16, 32, pp FILTER_VER_LUMA_AVX2_16xN 16, 64, pp FILTER_VER_LUMA_AVX2_16xN 16, 32, ps FILTER_VER_LUMA_AVX2_16xN 16, 64, ps %macro PROCESS_LUMA_AVX2_W16_16R 1 movu xm0, [r0] ; m0 = row 0 movu xm1, [r0 + r1] ; m1 = row 1 punpckhbw xm2, xm0, xm1 punpcklbw xm0, xm1 vinserti128 m0, m0, xm2, 1 pmaddubsw m0, [r5] movu xm2, [r0 + r1 * 2] ; m2 = row 2 punpckhbw xm3, xm1, xm2 punpcklbw xm1, xm2 vinserti128 m1, m1, xm3, 1 pmaddubsw m1, [r5] movu xm3, [r0 + r4] ; m3 = row 3 punpckhbw xm4, xm2, xm3 punpcklbw xm2, xm3 vinserti128 m2, m2, xm4, 1 pmaddubsw m4, m2, [r5 + 1 * mmsize] paddw m0, m4 pmaddubsw m2, [r5] lea r7, [r0 + r1 * 4] movu xm4, [r7] ; m4 = row 4 punpckhbw xm5, xm3, xm4 punpcklbw xm3, xm4 vinserti128 m3, m3, xm5, 1 pmaddubsw m5, m3, [r5 + 1 * mmsize] paddw m1, m5 pmaddubsw m3, [r5] movu xm5, [r7 + r1] ; m5 = row 5 punpckhbw xm6, xm4, xm5 punpcklbw xm4, xm5 vinserti128 m4, m4, xm6, 1 pmaddubsw m6, m4, [r5 + 2 * mmsize] paddw m0, m6 pmaddubsw m6, m4, [r5 + 1 * mmsize] paddw m2, m6 pmaddubsw m4, [r5] movu xm6, [r7 + r1 * 2] ; m6 = row 6 punpckhbw xm7, xm5, xm6 
punpcklbw xm5, xm6 vinserti128 m5, m5, xm7, 1 pmaddubsw m7, m5, [r5 + 2 * mmsize] paddw m1, m7 pmaddubsw m7, m5, [r5 + 1 * mmsize] paddw m3, m7 pmaddubsw m5, [r5] movu xm7, [r7 + r4] ; m7 = row 7 punpckhbw xm8, xm6, xm7 punpcklbw xm6, xm7 vinserti128 m6, m6, xm8, 1 pmaddubsw m8, m6, [r5 + 3 * mmsize] paddw m0, m8 pmaddubsw m8, m6, [r5 + 2 * mmsize] paddw m2, m8 pmaddubsw m8, m6, [r5 + 1 * mmsize] paddw m4, m8 pmaddubsw m6, [r5] lea r7, [r7 + r1 * 4] movu xm8, [r7] ; m8 = row 8 punpckhbw xm9, xm7, xm8 punpcklbw xm7, xm8 vinserti128 m7, m7, xm9, 1 pmaddubsw m9, m7, [r5 + 3 * mmsize] paddw m1, m9 pmaddubsw m9, m7, [r5 + 2 * mmsize] paddw m3, m9 pmaddubsw m9, m7, [r5 + 1 * mmsize] paddw m5, m9 pmaddubsw m7, [r5] movu xm9, [r7 + r1] ; m9 = row 9 punpckhbw xm10, xm8, xm9 punpcklbw xm8, xm9 vinserti128 m8, m8, xm10, 1 pmaddubsw m10, m8, [r5 + 3 * mmsize] paddw m2, m10 pmaddubsw m10, m8, [r5 + 2 * mmsize] paddw m4, m10 pmaddubsw m10, m8, [r5 + 1 * mmsize] paddw m6, m10 pmaddubsw m8, [r5] movu xm10, [r7 + r1 * 2] ; m10 = row 10 punpckhbw xm11, xm9, xm10 punpcklbw xm9, xm10 vinserti128 m9, m9, xm11, 1 pmaddubsw m11, m9, [r5 + 3 * mmsize] paddw m3, m11 pmaddubsw m11, m9, [r5 + 2 * mmsize] paddw m5, m11 pmaddubsw m11, m9, [r5 + 1 * mmsize] paddw m7, m11 pmaddubsw m9, [r5] movu xm11, [r7 + r4] ; m11 = row 11 punpckhbw xm12, xm10, xm11 punpcklbw xm10, xm11 vinserti128 m10, m10, xm12, 1 pmaddubsw m12, m10, [r5 + 3 * mmsize] paddw m4, m12 pmaddubsw m12, m10, [r5 + 2 * mmsize] paddw m6, m12 pmaddubsw m12, m10, [r5 + 1 * mmsize] paddw m8, m12 pmaddubsw m10, [r5] lea r7, [r7 + r1 * 4] movu xm12, [r7] ; m12 = row 12 punpckhbw xm13, xm11, xm12 punpcklbw xm11, xm12 vinserti128 m11, m11, xm13, 1 pmaddubsw m13, m11, [r5 + 3 * mmsize] paddw m5, m13 pmaddubsw m13, m11, [r5 + 2 * mmsize] paddw m7, m13 pmaddubsw m13, m11, [r5 + 1 * mmsize] paddw m9, m13 pmaddubsw m11, [r5] %ifidn %1,pp pmulhrsw m0, m14 ; m0 = word: row 0 pmulhrsw m1, m14 ; m1 = word: row 1 pmulhrsw m2, m14 ; m2 = word: row 2 pmulhrsw m3, m14 ; m3 = word: row 3 pmulhrsw m4, m14 ; m4 = word: row 4 pmulhrsw m5, m14 ; m5 = word: row 5 packuswb m0, m1 packuswb m2, m3 packuswb m4, m5 vpermq m0, m0, 11011000b vpermq m2, m2, 11011000b vpermq m4, m4, 11011000b vextracti128 xm1, m0, 1 vextracti128 xm3, m2, 1 vextracti128 xm5, m4, 1 movu [r2], xm0 movu [r2 + r3], xm1 movu [r2 + r3 * 2], xm2 movu [r2 + r6], xm3 lea r8, [r2 + r3 * 4] movu [r8], xm4 movu [r8 + r3], xm5 %else psubw m0, m14 ; m0 = word: row 0 psubw m1, m14 ; m1 = word: row 1 psubw m2, m14 ; m2 = word: row 2 psubw m3, m14 ; m3 = word: row 3 psubw m4, m14 ; m4 = word: row 4 psubw m5, m14 ; m5 = word: row 5 movu [r2], m0 movu [r2 + r3], m1 movu [r2 + r3 * 2], m2 movu [r2 + r6], m3 lea r8, [r2 + r3 * 4] movu [r8], m4 movu [r8 + r3], m5 %endif movu xm13, [r7 + r1] ; m13 = row 13 punpckhbw xm0, xm12, xm13 punpcklbw xm12, xm13 vinserti128 m12, m12, xm0, 1 pmaddubsw m0, m12, [r5 + 3 * mmsize] paddw m6, m0 pmaddubsw m0, m12, [r5 + 2 * mmsize] paddw m8, m0 pmaddubsw m0, m12, [r5 + 1 * mmsize] paddw m10, m0 pmaddubsw m12, [r5] movu xm0, [r7 + r1 * 2] ; m0 = row 14 punpckhbw xm1, xm13, xm0 punpcklbw xm13, xm0 vinserti128 m13, m13, xm1, 1 pmaddubsw m1, m13, [r5 + 3 * mmsize] paddw m7, m1 pmaddubsw m1, m13, [r5 + 2 * mmsize] paddw m9, m1 pmaddubsw m1, m13, [r5 + 1 * mmsize] paddw m11, m1 pmaddubsw m13, [r5] %ifidn %1,pp pmulhrsw m6, m14 ; m6 = word: row 6 pmulhrsw m7, m14 ; m7 = word: row 7 packuswb m6, m7 vpermq m6, m6, 11011000b vextracti128 xm7, m6, 1 movu [r8 + r3 * 2], xm6 movu [r8 + r6], xm7 %else psubw m6, 
m14 ; m6 = word: row 6 psubw m7, m14 ; m7 = word: row 7 movu [r8 + r3 * 2], m6 movu [r8 + r6], m7 %endif lea r8, [r8 + r3 * 4] movu xm1, [r7 + r4] ; m1 = row 15 punpckhbw xm2, xm0, xm1 punpcklbw xm0, xm1 vinserti128 m0, m0, xm2, 1 pmaddubsw m2, m0, [r5 + 3 * mmsize] paddw m8, m2 pmaddubsw m2, m0, [r5 + 2 * mmsize] paddw m10, m2 pmaddubsw m2, m0, [r5 + 1 * mmsize] paddw m12, m2 pmaddubsw m0, [r5] lea r7, [r7 + r1 * 4] movu xm2, [r7] ; m2 = row 16 punpckhbw xm3, xm1, xm2 punpcklbw xm1, xm2 vinserti128 m1, m1, xm3, 1 pmaddubsw m3, m1, [r5 + 3 * mmsize] paddw m9, m3 pmaddubsw m3, m1, [r5 + 2 * mmsize] paddw m11, m3 pmaddubsw m3, m1, [r5 + 1 * mmsize] paddw m13, m3 pmaddubsw m1, [r5] movu xm3, [r7 + r1] ; m3 = row 17 punpckhbw xm4, xm2, xm3 punpcklbw xm2, xm3 vinserti128 m2, m2, xm4, 1 pmaddubsw m4, m2, [r5 + 3 * mmsize] paddw m10, m4 pmaddubsw m4, m2, [r5 + 2 * mmsize] paddw m12, m4 pmaddubsw m2, [r5 + 1 * mmsize] paddw m0, m2 movu xm4, [r7 + r1 * 2] ; m4 = row 18 punpckhbw xm5, xm3, xm4 punpcklbw xm3, xm4 vinserti128 m3, m3, xm5, 1 pmaddubsw m5, m3, [r5 + 3 * mmsize] paddw m11, m5 pmaddubsw m5, m3, [r5 + 2 * mmsize] paddw m13, m5 pmaddubsw m3, [r5 + 1 * mmsize] paddw m1, m3 movu xm5, [r7 + r4] ; m5 = row 19 punpckhbw xm6, xm4, xm5 punpcklbw xm4, xm5 vinserti128 m4, m4, xm6, 1 pmaddubsw m6, m4, [r5 + 3 * mmsize] paddw m12, m6 pmaddubsw m4, [r5 + 2 * mmsize] paddw m0, m4 lea r7, [r7 + r1 * 4] movu xm6, [r7] ; m6 = row 20 punpckhbw xm7, xm5, xm6 punpcklbw xm5, xm6 vinserti128 m5, m5, xm7, 1 pmaddubsw m7, m5, [r5 + 3 * mmsize] paddw m13, m7 pmaddubsw m5, [r5 + 2 * mmsize] paddw m1, m5 movu xm7, [r7 + r1] ; m7 = row 21 punpckhbw xm2, xm6, xm7 punpcklbw xm6, xm7 vinserti128 m6, m6, xm2, 1 pmaddubsw m6, [r5 + 3 * mmsize] paddw m0, m6 movu xm2, [r7 + r1 * 2] ; m2 = row 22 punpckhbw xm3, xm7, xm2 punpcklbw xm7, xm2 vinserti128 m7, m7, xm3, 1 pmaddubsw m7, [r5 + 3 * mmsize] paddw m1, m7 %ifidn %1,pp pmulhrsw m8, m14 ; m8 = word: row 8 pmulhrsw m9, m14 ; m9 = word: row 9 pmulhrsw m10, m14 ; m10 = word: row 10 pmulhrsw m11, m14 ; m11 = word: row 11 pmulhrsw m12, m14 ; m12 = word: row 12 pmulhrsw m13, m14 ; m13 = word: row 13 pmulhrsw m0, m14 ; m0 = word: row 14 pmulhrsw m1, m14 ; m1 = word: row 15 packuswb m8, m9 packuswb m10, m11 packuswb m12, m13 packuswb m0, m1 vpermq m8, m8, 11011000b vpermq m10, m10, 11011000b vpermq m12, m12, 11011000b vpermq m0, m0, 11011000b vextracti128 xm9, m8, 1 vextracti128 xm11, m10, 1 vextracti128 xm13, m12, 1 vextracti128 xm1, m0, 1 movu [r8], xm8 movu [r8 + r3], xm9 movu [r8 + r3 * 2], xm10 movu [r8 + r6], xm11 lea r8, [r8 + r3 * 4] movu [r8], xm12 movu [r8 + r3], xm13 movu [r8 + r3 * 2], xm0 movu [r8 + r6], xm1 %else psubw m8, m14 ; m8 = word: row 8 psubw m9, m14 ; m9 = word: row 9 psubw m10, m14 ; m10 = word: row 10 psubw m11, m14 ; m11 = word: row 11 psubw m12, m14 ; m12 = word: row 12 psubw m13, m14 ; m13 = word: row 13 psubw m0, m14 ; m0 = word: row 14 psubw m1, m14 ; m1 = word: row 15 movu [r8], m8 movu [r8 + r3], m9 movu [r8 + r3 * 2], m10 movu [r8 + r6], m11 lea r8, [r8 + r3 * 4] movu [r8], m12 movu [r8 + r3], m13 movu [r8 + r3 * 2], m0 movu [r8 + r6], m1 %endif %endmacro %macro PROCESS_LUMA_AVX2_W16_8R 1 movu xm0, [r0] ; m0 = row 0 movu xm1, [r0 + r1] ; m1 = row 1 punpckhbw xm2, xm0, xm1 punpcklbw xm0, xm1 vinserti128 m0, m0, xm2, 1 pmaddubsw m0, [r5] movu xm2, [r0 + r1 * 2] ; m2 = row 2 punpckhbw xm3, xm1, xm2 punpcklbw xm1, xm2 vinserti128 m1, m1, xm3, 1 pmaddubsw m1, [r5] movu xm3, [r0 + r4] ; m3 = row 3 punpckhbw xm4, xm2, xm3 punpcklbw xm2, xm3 vinserti128 
m2, m2, xm4, 1 pmaddubsw m4, m2, [r5 + 1 * mmsize] paddw m0, m4 pmaddubsw m2, [r5] lea r7, [r0 + r1 * 4] movu xm4, [r7] ; m4 = row 4 punpckhbw xm5, xm3, xm4 punpcklbw xm3, xm4 vinserti128 m3, m3, xm5, 1 pmaddubsw m5, m3, [r5 + 1 * mmsize] paddw m1, m5 pmaddubsw m3, [r5] movu xm5, [r7 + r1] ; m5 = row 5 punpckhbw xm6, xm4, xm5 punpcklbw xm4, xm5 vinserti128 m4, m4, xm6, 1 pmaddubsw m6, m4, [r5 + 2 * mmsize] paddw m0, m6 pmaddubsw m6, m4, [r5 + 1 * mmsize] paddw m2, m6 pmaddubsw m4, [r5] movu xm6, [r7 + r1 * 2] ; m6 = row 6 punpckhbw xm7, xm5, xm6 punpcklbw xm5, xm6 vinserti128 m5, m5, xm7, 1 pmaddubsw m7, m5, [r5 + 2 * mmsize] paddw m1, m7 pmaddubsw m7, m5, [r5 + 1 * mmsize] paddw m3, m7 pmaddubsw m5, [r5] movu xm7, [r7 + r4] ; m7 = row 7 punpckhbw xm8, xm6, xm7 punpcklbw xm6, xm7 vinserti128 m6, m6, xm8, 1 pmaddubsw m8, m6, [r5 + 3 * mmsize] paddw m0, m8 pmaddubsw m8, m6, [r5 + 2 * mmsize] paddw m2, m8 pmaddubsw m8, m6, [r5 + 1 * mmsize] paddw m4, m8 pmaddubsw m6, [r5] lea r7, [r7 + r1 * 4] movu xm8, [r7] ; m8 = row 8 punpckhbw xm9, xm7, xm8 punpcklbw xm7, xm8 vinserti128 m7, m7, xm9, 1 pmaddubsw m9, m7, [r5 + 3 * mmsize] paddw m1, m9 pmaddubsw m9, m7, [r5 + 2 * mmsize] paddw m3, m9 pmaddubsw m9, m7, [r5 + 1 * mmsize] paddw m5, m9 pmaddubsw m7, [r5] movu xm9, [r7 + r1] ; m9 = row 9 punpckhbw xm10, xm8, xm9 punpcklbw xm8, xm9 vinserti128 m8, m8, xm10, 1 pmaddubsw m10, m8, [r5 + 3 * mmsize] paddw m2, m10 pmaddubsw m10, m8, [r5 + 2 * mmsize] paddw m4, m10 pmaddubsw m10, m8, [r5 + 1 * mmsize] paddw m6, m10 movu xm10, [r7 + r1 * 2] ; m10 = row 10 punpckhbw xm11, xm9, xm10 punpcklbw xm9, xm10 vinserti128 m9, m9, xm11, 1 pmaddubsw m11, m9, [r5 + 3 * mmsize] paddw m3, m11 pmaddubsw m11, m9, [r5 + 2 * mmsize] paddw m5, m11 pmaddubsw m11, m9, [r5 + 1 * mmsize] paddw m7, m11 movu xm11, [r7 + r4] ; m11 = row 11 punpckhbw xm12, xm10, xm11 punpcklbw xm10, xm11 vinserti128 m10, m10, xm12, 1 pmaddubsw m12, m10, [r5 + 3 * mmsize] paddw m4, m12 pmaddubsw m12, m10, [r5 + 2 * mmsize] paddw m6, m12 lea r7, [r7 + r1 * 4] movu xm12, [r7] ; m12 = row 12 punpckhbw xm13, xm11, xm12 punpcklbw xm11, xm12 vinserti128 m11, m11, xm13, 1 pmaddubsw m13, m11, [r5 + 3 * mmsize] paddw m5, m13 pmaddubsw m13, m11, [r5 + 2 * mmsize] paddw m7, m13 %ifidn %1,pp pmulhrsw m0, m14 ; m0 = word: row 0 pmulhrsw m1, m14 ; m1 = word: row 1 pmulhrsw m2, m14 ; m2 = word: row 2 pmulhrsw m3, m14 ; m3 = word: row 3 pmulhrsw m4, m14 ; m4 = word: row 4 pmulhrsw m5, m14 ; m5 = word: row 5 packuswb m0, m1 packuswb m2, m3 packuswb m4, m5 vpermq m0, m0, 11011000b vpermq m2, m2, 11011000b vpermq m4, m4, 11011000b vextracti128 xm1, m0, 1 vextracti128 xm3, m2, 1 vextracti128 xm5, m4, 1 movu [r2], xm0 movu [r2 + r3], xm1 movu [r2 + r3 * 2], xm2 movu [r2 + r6], xm3 lea r8, [r2 + r3 * 4] movu [r8], xm4 movu [r8 + r3], xm5 %else psubw m0, m14 ; m0 = word: row 0 psubw m1, m14 ; m1 = word: row 1 psubw m2, m14 ; m2 = word: row 2 psubw m3, m14 ; m3 = word: row 3 psubw m4, m14 ; m4 = word: row 4 psubw m5, m14 ; m5 = word: row 5 movu [r2], m0 movu [r2 + r3], m1 movu [r2 + r3 * 2], m2 movu [r2 + r6], m3 lea r8, [r2 + r3 * 4] movu [r8], m4 movu [r8 + r3], m5 %endif movu xm13, [r7 + r1] ; m13 = row 13 punpckhbw xm0, xm12, xm13 punpcklbw xm12, xm13 vinserti128 m12, m12, xm0, 1 pmaddubsw m0, m12, [r5 + 3 * mmsize] paddw m6, m0 movu xm0, [r7 + r1 * 2] ; m0 = row 14 punpckhbw xm1, xm13, xm0 punpcklbw xm13, xm0 vinserti128 m13, m13, xm1, 1 pmaddubsw m1, m13, [r5 + 3 * mmsize] paddw m7, m1 %ifidn %1,pp pmulhrsw m6, m14 ; m6 = word: row 6 pmulhrsw m7, m14 ; m7 = word: 
row 7 packuswb m6, m7 vpermq m6, m6, 11011000b vextracti128 xm7, m6, 1 movu [r8 + r3 * 2], xm6 movu [r8 + r6], xm7 %else psubw m6, m14 ; m6 = word: row 6 psubw m7, m14 ; m7 = word: row 7 movu [r8 + r3 * 2], m6 movu [r8 + r6], m7 %endif %endmacro %macro FILTER_VER_LUMA_AVX2_24x32 1 INIT_YMM avx2 %if ARCH_X86_64 == 1 cglobal interp_8tap_vert_%1_24x32, 4, 11, 15 mov r4d, r4m shl r4d, 7 %ifdef PIC lea r5, [tab_LumaCoeffVer_32] add r5, r4 %else lea r5, [tab_LumaCoeffVer_32 + r4] %endif lea r4, [r1 * 3] sub r0, r4 %ifidn %1,ps add r3d, r3d vbroadcasti128 m14, [pw_2000] %else mova m14, [pw_512] %endif lea r6, [r3 * 3] lea r10, [r1 * 4] mov r9d, 2 .loopH: PROCESS_LUMA_AVX2_W16_16R %1 %ifidn %1,pp add r2, 16 %else add r2, 32 %endif add r0, 16 movq xm1, [r0] ; m1 = row 0 movq xm2, [r0 + r1] ; m2 = row 1 punpcklbw xm1, xm2 movq xm3, [r0 + r1 * 2] ; m3 = row 2 punpcklbw xm2, xm3 vinserti128 m5, m1, xm2, 1 pmaddubsw m5, [r5] movq xm4, [r0 + r4] ; m4 = row 3 punpcklbw xm3, xm4 lea r7, [r0 + r1 * 4] movq xm1, [r7] ; m1 = row 4 punpcklbw xm4, xm1 vinserti128 m2, m3, xm4, 1 pmaddubsw m0, m2, [r5 + 1 * mmsize] paddw m5, m0 pmaddubsw m2, [r5] movq xm3, [r7 + r1] ; m3 = row 5 punpcklbw xm1, xm3 movq xm4, [r7 + r1 * 2] ; m4 = row 6 punpcklbw xm3, xm4 vinserti128 m1, m1, xm3, 1 pmaddubsw m3, m1, [r5 + 2 * mmsize] paddw m5, m3 pmaddubsw m0, m1, [r5 + 1 * mmsize] paddw m2, m0 pmaddubsw m1, [r5] movq xm3, [r7 + r4] ; m3 = row 7 punpcklbw xm4, xm3 lea r7, [r7 + r1 * 4] movq xm0, [r7] ; m0 = row 8 punpcklbw xm3, xm0 vinserti128 m4, m4, xm3, 1 pmaddubsw m3, m4, [r5 + 3 * mmsize] paddw m5, m3 pmaddubsw m3, m4, [r5 + 2 * mmsize] paddw m2, m3 pmaddubsw m3, m4, [r5 + 1 * mmsize] paddw m1, m3 pmaddubsw m4, [r5] movq xm3, [r7 + r1] ; m3 = row 9 punpcklbw xm0, xm3 movq xm6, [r7 + r1 * 2] ; m6 = row 10 punpcklbw xm3, xm6 vinserti128 m0, m0, xm3, 1 pmaddubsw m3, m0, [r5 + 3 * mmsize] paddw m2, m3 pmaddubsw m3, m0, [r5 + 2 * mmsize] paddw m1, m3 pmaddubsw m3, m0, [r5 + 1 * mmsize] paddw m4, m3 pmaddubsw m0, [r5] movq xm3, [r7 + r4] ; m3 = row 11 punpcklbw xm6, xm3 lea r7, [r7 + r1 * 4] movq xm7, [r7] ; m7 = row 12 punpcklbw xm3, xm7 vinserti128 m6, m6, xm3, 1 pmaddubsw m3, m6, [r5 + 3 * mmsize] paddw m1, m3 pmaddubsw m3, m6, [r5 + 2 * mmsize] paddw m4, m3 pmaddubsw m3, m6, [r5 + 1 * mmsize] paddw m0, m3 pmaddubsw m6, [r5] movq xm3, [r7 + r1] ; m3 = row 13 punpcklbw xm7, xm3 movq xm8, [r7 + r1 * 2] ; m8 = row 14 punpcklbw xm3, xm8 vinserti128 m7, m7, xm3, 1 pmaddubsw m3, m7, [r5 + 3 * mmsize] paddw m4, m3 pmaddubsw m3, m7, [r5 + 2 * mmsize] paddw m0, m3 pmaddubsw m3, m7, [r5 + 1 * mmsize] paddw m6, m3 pmaddubsw m7, [r5] movq xm3, [r7 + r4] ; m3 = row 15 punpcklbw xm8, xm3 lea r7, [r7 + r1 * 4] movq xm9, [r7] ; m9 = row 16 punpcklbw xm3, xm9 vinserti128 m8, m8, xm3, 1 pmaddubsw m3, m8, [r5 + 3 * mmsize] paddw m0, m3 pmaddubsw m3, m8, [r5 + 2 * mmsize] paddw m6, m3 pmaddubsw m3, m8, [r5 + 1 * mmsize] paddw m7, m3 pmaddubsw m8, [r5] movq xm3, [r7 + r1] ; m3 = row 17 punpcklbw xm9, xm3 movq xm10, [r7 + r1 * 2] ; m10 = row 18 punpcklbw xm3, xm10 vinserti128 m9, m9, xm3, 1 pmaddubsw m3, m9, [r5 + 3 * mmsize] paddw m6, m3 pmaddubsw m3, m9, [r5 + 2 * mmsize] paddw m7, m3 pmaddubsw m3, m9, [r5 + 1 * mmsize] paddw m8, m3 movq xm3, [r7 + r4] ; m3 = row 19 punpcklbw xm10, xm3 lea r7, [r7 + r1 * 4] movq xm9, [r7] ; m9 = row 20 punpcklbw xm3, xm9 vinserti128 m10, m10, xm3, 1 pmaddubsw m3, m10, [r5 + 3 * mmsize] paddw m7, m3 pmaddubsw m3, m10, [r5 + 2 * mmsize] paddw m8, m3 movq xm3, [r7 + r1] ; m3 = row 21 punpcklbw xm9, xm3 movq xm10, [r7 + 
r1 * 2] ; m10 = row 22 punpcklbw xm3, xm10 vinserti128 m9, m9, xm3, 1 pmaddubsw m3, m9, [r5 + 3 * mmsize] paddw m8, m3 %ifidn %1,pp pmulhrsw m5, m14 ; m5 = word: row 0, row 1 pmulhrsw m2, m14 ; m2 = word: row 2, row 3 pmulhrsw m1, m14 ; m1 = word: row 4, row 5 pmulhrsw m4, m14 ; m4 = word: row 6, row 7 pmulhrsw m0, m14 ; m0 = word: row 8, row 9 pmulhrsw m6, m14 ; m6 = word: row 10, row 11 pmulhrsw m7, m14 ; m7 = word: row 12, row 13 pmulhrsw m8, m14 ; m8 = word: row 14, row 15 packuswb m5, m2 packuswb m1, m4 packuswb m0, m6 packuswb m7, m8 vextracti128 xm2, m5, 1 vextracti128 xm4, m1, 1 vextracti128 xm6, m0, 1 vextracti128 xm8, m7, 1 movq [r2], xm5 movq [r2 + r3], xm2 movhps [r2 + r3 * 2], xm5 movhps [r2 + r6], xm2 lea r8, [r2 + r3 * 4] movq [r8], xm1 movq [r8 + r3], xm4 movhps [r8 + r3 * 2], xm1 movhps [r8 + r6], xm4 lea r8, [r8 + r3 * 4] movq [r8], xm0 movq [r8 + r3], xm6 movhps [r8 + r3 * 2], xm0 movhps [r8 + r6], xm6 lea r8, [r8 + r3 * 4] movq [r8], xm7 movq [r8 + r3], xm8 movhps [r8 + r3 * 2], xm7 movhps [r8 + r6], xm8 %else psubw m5, m14 ; m5 = word: row 0, row 1 psubw m2, m14 ; m2 = word: row 2, row 3 psubw m1, m14 ; m1 = word: row 4, row 5 psubw m4, m14 ; m4 = word: row 6, row 7 psubw m0, m14 ; m0 = word: row 8, row 9 psubw m6, m14 ; m6 = word: row 10, row 11 psubw m7, m14 ; m7 = word: row 12, row 13 psubw m8, m14 ; m8 = word: row 14, row 15 vextracti128 xm3, m5, 1 movu [r2], xm5 movu [r2 + r3], xm3 vextracti128 xm3, m2, 1 movu [r2 + r3 * 2], xm2 movu [r2 + r6], xm3 vextracti128 xm3, m1, 1 lea r8, [r2 + r3 * 4] movu [r8], xm1 movu [r8 + r3], xm3 vextracti128 xm3, m4, 1 movu [r8 + r3 * 2], xm4 movu [r8 + r6], xm3 vextracti128 xm3, m0, 1 lea r8, [r8 + r3 * 4] movu [r8], xm0 movu [r8 + r3], xm3 vextracti128 xm3, m6, 1 movu [r8 + r3 * 2], xm6 movu [r8 + r6], xm3 vextracti128 xm3, m7, 1 lea r8, [r8 + r3 * 4] movu [r8], xm7 movu [r8 + r3], xm3 vextracti128 xm3, m8, 1 movu [r8 + r3 * 2], xm8 movu [r8 + r6], xm3 %endif sub r7, r10 lea r0, [r7 - 16] %ifidn %1,pp lea r2, [r8 + r3 * 4 - 16] %else lea r2, [r8 + r3 * 4 - 32] %endif dec r9d jnz .loopH RET %endif %endmacro FILTER_VER_LUMA_AVX2_24x32 pp FILTER_VER_LUMA_AVX2_24x32 ps %macro FILTER_VER_LUMA_AVX2_32xN 3 INIT_YMM avx2 %if ARCH_X86_64 == 1 cglobal interp_8tap_vert_%3_%1x%2, 4, 12, 15 mov r4d, r4m shl r4d, 7 %ifdef PIC lea r5, [tab_LumaCoeffVer_32] add r5, r4 %else lea r5, [tab_LumaCoeffVer_32 + r4] %endif lea r4, [r1 * 3] sub r0, r4 %ifidn %3,ps add r3d, r3d vbroadcasti128 m14, [pw_2000] %else mova m14, [pw_512] %endif lea r6, [r3 * 3] lea r11, [r1 * 4] mov r9d, %2 / 16 .loopH: mov r10d, %1 / 16 .loopW: PROCESS_LUMA_AVX2_W16_16R %3 %ifidn %3,pp add r2, 16 %else add r2, 32 %endif add r0, 16 dec r10d jnz .loopW sub r7, r11 lea r0, [r7 - 16] %ifidn %3,pp lea r2, [r8 + r3 * 4 - 16] %else lea r2, [r8 + r3 * 4 - 32] %endif dec r9d jnz .loopH RET %endif %endmacro FILTER_VER_LUMA_AVX2_32xN 32, 32, pp FILTER_VER_LUMA_AVX2_32xN 32, 64, pp FILTER_VER_LUMA_AVX2_32xN 32, 32, ps FILTER_VER_LUMA_AVX2_32xN 32, 64, ps %macro FILTER_VER_LUMA_AVX2_32x16 1 INIT_YMM avx2 %if ARCH_X86_64 == 1 cglobal interp_8tap_vert_%1_32x16, 4, 10, 15 mov r4d, r4m shl r4d, 7 %ifdef PIC lea r5, [tab_LumaCoeffVer_32] add r5, r4 %else lea r5, [tab_LumaCoeffVer_32 + r4] %endif lea r4, [r1 * 3] sub r0, r4 %ifidn %1,ps add r3d, r3d vbroadcasti128 m14, [pw_2000] %else mova m14, [pw_512] %endif lea r6, [r3 * 3] mov r9d, 2 .loopW: PROCESS_LUMA_AVX2_W16_16R %1 %ifidn %1,pp add r2, 16 %else add r2, 32 %endif add r0, 16 dec r9d jnz .loopW RET %endif %endmacro 
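; Note (added, explanatory sketch): the wide AVX2 wrappers below only loop
; PROCESS_LUMA_AVX2_W16_16R / _W16_8R over 16-pixel columns; the output stage is shared:
;   pp: pmulhrsw with pw_512  ->  pel = clip8((sum + 32) >> 6), packed to bytes
;   ps: psubw with pw_2000    ->  dst = sum - 8192, kept as int16 (dst stride doubled)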
FILTER_VER_LUMA_AVX2_32x16 pp
FILTER_VER_LUMA_AVX2_32x16 ps

%macro FILTER_VER_LUMA_AVX2_32x24 1
INIT_YMM avx2
%if ARCH_X86_64 == 1
cglobal interp_8tap_vert_%1_32x24, 4, 10, 15
    mov r4d, r4m
    shl r4d, 7
%ifdef PIC
    lea r5, [tab_LumaCoeffVer_32]
    add r5, r4
%else
    lea r5, [tab_LumaCoeffVer_32 + r4]
%endif
    lea r4, [r1 * 3]
    sub r0, r4
%ifidn %1,ps
    add r3d, r3d
%endif
    lea r6, [r3 * 3]
%ifidn %1,pp
    mova m14, [pw_512]
%else
    vbroadcasti128 m14, [pw_2000]
%endif
    mov r9d, 2
.loopW:
    PROCESS_LUMA_AVX2_W16_16R %1
%ifidn %1,pp
    add r2, 16
%else
    add r2, 32
%endif
    add r0, 16
    dec r9d
    jnz .loopW
    lea r9, [r1 * 4]
    sub r7, r9
    lea r0, [r7 - 16]
%ifidn %1,pp
    lea r2, [r8 + r3 * 4 - 16]
%else
    lea r2, [r8 + r3 * 4 - 32]
%endif
    mov r9d, 2
.loop:
    PROCESS_LUMA_AVX2_W16_8R %1
%ifidn %1,pp
    add r2, 16
%else
    add r2, 32
%endif
    add r0, 16
    dec r9d
    jnz .loop
    RET
%endif
%endmacro

FILTER_VER_LUMA_AVX2_32x24 pp
FILTER_VER_LUMA_AVX2_32x24 ps

%macro FILTER_VER_LUMA_AVX2_32x8 1
INIT_YMM avx2
%if ARCH_X86_64 == 1
cglobal interp_8tap_vert_%1_32x8, 4, 10, 15
    mov r4d, r4m
    shl r4d, 7
%ifdef PIC
    lea r5, [tab_LumaCoeffVer_32]
    add r5, r4
%else
    lea r5, [tab_LumaCoeffVer_32 + r4]
%endif
    lea r4, [r1 * 3]
    sub r0, r4
%ifidn %1,ps
    add r3d, r3d
%endif
    lea r6, [r3 * 3]
%ifidn %1,pp
    mova m14, [pw_512]
%else
    vbroadcasti128 m14, [pw_2000]
%endif
    mov r9d, 2
.loopW:
    PROCESS_LUMA_AVX2_W16_8R %1
%ifidn %1,pp
    add r2, 16
%else
    add r2, 32
%endif
    add r0, 16
    dec r9d
    jnz .loopW
    RET
%endif
%endmacro

FILTER_VER_LUMA_AVX2_32x8 pp
FILTER_VER_LUMA_AVX2_32x8 ps

%macro FILTER_VER_LUMA_AVX2_48x64 1
INIT_YMM avx2
%if ARCH_X86_64 == 1
cglobal interp_8tap_vert_%1_48x64, 4, 12, 15
    mov r4d, r4m
    shl r4d, 7
%ifdef PIC
    lea r5, [tab_LumaCoeffVer_32]
    add r5, r4
%else
    lea r5, [tab_LumaCoeffVer_32 + r4]
%endif
    lea r4, [r1 * 3]
    sub r0, r4
%ifidn %1,ps
    add r3d, r3d
%endif
    lea r6, [r3 * 3]
    lea r11, [r1 * 4]
%ifidn %1,pp
    mova m14, [pw_512]
%else
    vbroadcasti128 m14, [pw_2000]
%endif
    mov r9d, 4
.loopH:
    mov r10d, 3
.loopW:
    PROCESS_LUMA_AVX2_W16_16R %1
%ifidn %1,pp
    add r2, 16
%else
    add r2, 32
%endif
    add r0, 16
    dec r10d
    jnz .loopW
    sub r7, r11
    lea r0, [r7 - 32]
%ifidn %1,pp
    lea r2, [r8 + r3 * 4 - 32]
%else
    lea r2, [r8 + r3 * 4 - 64]
%endif
    dec r9d
    jnz .loopH
    RET
%endif
%endmacro

FILTER_VER_LUMA_AVX2_48x64 pp
FILTER_VER_LUMA_AVX2_48x64 ps

%macro FILTER_VER_LUMA_AVX2_64xN 3
INIT_YMM avx2
%if ARCH_X86_64 == 1
cglobal interp_8tap_vert_%3_%1x%2, 4, 12, 15
    mov r4d, r4m
    shl r4d, 7
%ifdef PIC
    lea r5, [tab_LumaCoeffVer_32]
    add r5, r4
%else
    lea r5, [tab_LumaCoeffVer_32 + r4]
%endif
    lea r4, [r1 * 3]
    sub r0, r4
%ifidn %3,ps
    add r3d, r3d
%endif
    lea r6, [r3 * 3]
    lea r11, [r1 * 4]
%ifidn %3,pp
    mova m14, [pw_512]
%else
    vbroadcasti128 m14, [pw_2000]
%endif
    mov r9d, %2 / 16
.loopH:
    mov r10d, %1 / 16
.loopW:
    PROCESS_LUMA_AVX2_W16_16R %3
%ifidn %3,pp
    add r2, 16
%else
    add r2, 32
%endif
    add r0, 16
    dec r10d
    jnz .loopW
    sub r7, r11
    lea r0, [r7 - 48]
%ifidn %3,pp
    lea r2, [r8 + r3 * 4 - 48]
%else
    lea r2, [r8 + r3 * 4 - 96]
%endif
    dec r9d
    jnz .loopH
    RET
%endif
%endmacro

FILTER_VER_LUMA_AVX2_64xN 64, 32, pp
FILTER_VER_LUMA_AVX2_64xN 64, 48, pp
FILTER_VER_LUMA_AVX2_64xN 64, 64, pp
FILTER_VER_LUMA_AVX2_64xN 64, 32, ps
FILTER_VER_LUMA_AVX2_64xN 64, 48, ps
FILTER_VER_LUMA_AVX2_64xN 64, 64, ps

%macro FILTER_VER_LUMA_AVX2_64x16 1
INIT_YMM avx2
%if ARCH_X86_64 == 1
cglobal interp_8tap_vert_%1_64x16, 4, 10, 15
    mov r4d, r4m
    shl r4d, 7
%ifdef PIC
    lea r5, [tab_LumaCoeffVer_32]
    add r5, r4
%else
    lea r5, [tab_LumaCoeffVer_32 + r4]
%endif
    lea r4, [r1 * 3]
    sub r0, r4
%ifidn %1,ps
    add r3d, r3d
%endif
    lea r6, [r3 * 3]
%ifidn %1,pp
    mova m14, [pw_512]
%else
    vbroadcasti128 m14, [pw_2000]
%endif
    mov r9d, 4
.loopW:
    PROCESS_LUMA_AVX2_W16_16R %1
%ifidn %1,pp
    add r2, 16
%else
    add r2, 32
%endif
    add r0, 16
    dec r9d
    jnz .loopW
    RET
%endif
%endmacro

FILTER_VER_LUMA_AVX2_64x16 pp
FILTER_VER_LUMA_AVX2_64x16 ps

;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
%macro FILTER_VER_LUMA 3
INIT_XMM sse4
cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8, 0-gprsize
    lea r5, [3 * r1]
    sub r0, r5
    shl r4d, 6
%ifidn %3,ps
    add r3d, r3d
%endif
%ifdef PIC
    lea r5, [tab_LumaCoeffVer]
    lea r6, [r5 + r4]
%else
    lea r6, [tab_LumaCoeffVer + r4]
%endif
%ifidn %3,pp
    mova m3, [pw_512]
%else
    mova m3, [pw_2000]
%endif
    mov dword [rsp], %2/4
.loopH:
    mov r4d, (%1/8)
.loopW:
    PROCESS_LUMA_W8_4R
%ifidn %3,pp
    pmulhrsw m7, m3
    pmulhrsw m6, m3
    pmulhrsw m5, m3
    pmulhrsw m4, m3
    packuswb m7, m6
    packuswb m5, m4
    movlps [r2], m7
    movhps [r2 + r3], m7
    lea r5, [r2 + 2 * r3]
    movlps [r5], m5
    movhps [r5 + r3], m5
%else
    psubw m7, m3
    psubw m6, m3
    psubw m5, m3
    psubw m4, m3
    movu [r2], m7
    movu [r2 + r3], m6
    lea r5, [r2 + 2 * r3]
    movu [r5], m5
    movu [r5 + r3], m4
%endif
    lea r5, [8 * r1 - 8]
    sub r0, r5
%ifidn %3,pp
    add r2, 8
%else
    add r2, 16
%endif
    dec r4d
    jnz .loopW
    lea r0, [r0 + 4 * r1 - %1]
%ifidn %3,pp
    lea r2, [r2 + 4 * r3 - %1]
%else
    lea r2, [r2 + 4 * r3 - 2 * %1]
%endif
    dec dword [rsp]
    jnz .loopH
    RET
%endmacro

FILTER_VER_LUMA 16, 4, pp
FILTER_VER_LUMA 16, 8, pp
FILTER_VER_LUMA 16, 12, pp
FILTER_VER_LUMA 16, 16, pp
FILTER_VER_LUMA 16, 32, pp
FILTER_VER_LUMA 16, 64, pp
FILTER_VER_LUMA 24, 32, pp
FILTER_VER_LUMA 32, 8, pp
FILTER_VER_LUMA 32, 16, pp
FILTER_VER_LUMA 32, 24, pp
FILTER_VER_LUMA 32, 32, pp
FILTER_VER_LUMA 32, 64, pp
FILTER_VER_LUMA 48, 64, pp
FILTER_VER_LUMA 64, 16, pp
FILTER_VER_LUMA 64, 32, pp
FILTER_VER_LUMA 64, 48, pp
FILTER_VER_LUMA 64, 64, pp
FILTER_VER_LUMA 16, 4, ps
FILTER_VER_LUMA 16, 8, ps
FILTER_VER_LUMA 16, 12, ps
FILTER_VER_LUMA 16, 16, ps
FILTER_VER_LUMA 16, 32, ps
FILTER_VER_LUMA 16, 64, ps
FILTER_VER_LUMA 24, 32, ps
FILTER_VER_LUMA 32, 8, ps
FILTER_VER_LUMA 32, 16, ps
FILTER_VER_LUMA 32, 24, ps
FILTER_VER_LUMA 32, 32, ps
FILTER_VER_LUMA 32, 64, ps
FILTER_VER_LUMA 48, 64, ps
FILTER_VER_LUMA 64, 16, ps
FILTER_VER_LUMA 64, 32, ps
FILTER_VER_LUMA 64, 48, ps
FILTER_VER_LUMA 64, 64, ps
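; Note (added, explanatory sketch): the interp_8tap_vert_sp kernels below take the 16-bit
; ps intermediate (tap sums normalized to 64, offset by -8192 via pw_2000) and filter it
; again with pmaddwd into 32-bit lanes. pd_526336 = (8192 << 6) + (1 << 11), so the final
; "paddd m, [pd_526336]" / "psrad m, 12" cancels the carried -8192*64 offset and performs
; a rounded 12-bit shift before packing back down to pixels.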
%macro PROCESS_LUMA_SP_W4_4R 0
    movq m0, [r0]
    movq m1, [r0 + r1]
    punpcklwd m0, m1                ;m0=[0 1]
    pmaddwd m0, [r6 + 0 *16]        ;m0=[0+1] Row1
    lea r0, [r0 + 2 * r1]
    movq m4, [r0]
    punpcklwd m1, m4                ;m1=[1 2]
    pmaddwd m1, [r6 + 0 *16]        ;m1=[1+2] Row2
    movq m5, [r0 + r1]
    punpcklwd m4, m5                ;m4=[2 3]
    pmaddwd m2, m4, [r6 + 0 *16]    ;m2=[2+3] Row3
    pmaddwd m4, [r6 + 1 * 16]
    paddd m0, m4                    ;m0=[0+1+2+3] Row1
    lea r0, [r0 + 2 * r1]
    movq m4, [r0]
    punpcklwd m5, m4                ;m5=[3 4]
    pmaddwd m3, m5, [r6 + 0 *16]    ;m3=[3+4] Row4
    pmaddwd m5, [r6 + 1 * 16]
    paddd m1, m5                    ;m1 = [1+2+3+4] Row2
    movq m5, [r0 + r1]
    punpcklwd m4, m5                ;m4=[4 5]
    pmaddwd m6, m4, [r6 + 1 * 16]
    paddd m2, m6                    ;m2=[2+3+4+5] Row3
    pmaddwd m4, [r6 + 2 * 16]
    paddd m0, m4                    ;m0=[0+1+2+3+4+5] Row1
    lea r0, [r0 + 2 * r1]
    movq m4, [r0]
    punpcklwd m5, m4                ;m5=[5 6]
    pmaddwd m6, m5, [r6 + 1 * 16]
    paddd m3, m6                    ;m3=[3+4+5+6] Row4
    pmaddwd m5, [r6 + 2 * 16]
    paddd m1, m5                    ;m1=[1+2+3+4+5+6] Row2
    movq m5, [r0 + r1]
    punpcklwd m4, m5                ;m4=[6 7]
    pmaddwd m6, m4, [r6 + 2 * 16]
    paddd m2, m6                    ;m2=[2+3+4+5+6+7] Row3
    pmaddwd m4, [r6 + 3 * 16]
    paddd m0, m4                    ;m0=[0+1+2+3+4+5+6+7] Row1 end
    lea r0, [r0 + 2 * r1]
    movq m4, [r0]
    punpcklwd m5, m4                ;m5=[7 8]
    pmaddwd m6, m5, [r6 + 2 * 16]
    paddd m3, m6                    ;m3=[3+4+5+6+7+8] Row4
    pmaddwd m5, [r6 + 3 * 16]
    paddd m1, m5                    ;m1=[1+2+3+4+5+6+7+8] Row2 end
    movq m5, [r0 + r1]
    punpcklwd m4, m5                ;m4=[8 9]
    pmaddwd m4, [r6 + 3 * 16]
    paddd m2, m4                    ;m2=[2+3+4+5+6+7+8+9] Row3 end
    movq m4, [r0 + 2 * r1]
    punpcklwd m5, m4                ;m5=[9 10]
    pmaddwd m5, [r6 + 3 * 16]
    paddd m3, m5                    ;m3=[3+4+5+6+7+8+9+10] Row4 end
%endmacro

;--------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;--------------------------------------------------------------------------------------------------------------
%macro FILTER_VER_LUMA_SP 2
INIT_XMM sse4
cglobal interp_8tap_vert_sp_%1x%2, 5, 7, 8, 0-gprsize
    add r1d, r1d
    lea r5, [r1 + 2 * r1]
    sub r0, r5
    shl r4d, 6
%ifdef PIC
    lea r5, [tab_LumaCoeffV]
    lea r6, [r5 + r4]
%else
    lea r6, [tab_LumaCoeffV + r4]
%endif
    mova m7, [pd_526336]
    mov dword [rsp], %2/4
.loopH:
    mov r4d, (%1/4)
.loopW:
    PROCESS_LUMA_SP_W4_4R
    paddd m0, m7
    paddd m1, m7
    paddd m2, m7
    paddd m3, m7
    psrad m0, 12
    psrad m1, 12
    psrad m2, 12
    psrad m3, 12
    packssdw m0, m1
    packssdw m2, m3
    packuswb m0, m2
    movd [r2], m0
    pextrd [r2 + r3], m0, 1
    lea r5, [r2 + 2 * r3]
    pextrd [r5], m0, 2
    pextrd [r5 + r3], m0, 3
    lea r5, [8 * r1 - 2 * 4]
    sub r0, r5
    add r2, 4
    dec r4d
    jnz .loopW
    lea r0, [r0 + 4 * r1 - 2 * %1]
    lea r2, [r2 + 4 * r3 - %1]
    dec dword [rsp]
    jnz .loopH
    RET
%endmacro

;--------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;--------------------------------------------------------------------------------------------------------------
FILTER_VER_LUMA_SP 4, 4
FILTER_VER_LUMA_SP 8, 8
FILTER_VER_LUMA_SP 8, 4
FILTER_VER_LUMA_SP 4, 8
FILTER_VER_LUMA_SP 16, 16
FILTER_VER_LUMA_SP 16, 8
FILTER_VER_LUMA_SP 8, 16
FILTER_VER_LUMA_SP 16, 12
FILTER_VER_LUMA_SP 12, 16
FILTER_VER_LUMA_SP 16, 4
FILTER_VER_LUMA_SP 4, 16
FILTER_VER_LUMA_SP 32, 32
FILTER_VER_LUMA_SP 32, 16
FILTER_VER_LUMA_SP 16, 32
FILTER_VER_LUMA_SP 32, 24
FILTER_VER_LUMA_SP 24, 32
FILTER_VER_LUMA_SP 32, 8
FILTER_VER_LUMA_SP 8, 32
FILTER_VER_LUMA_SP 64, 64
FILTER_VER_LUMA_SP 64, 32
FILTER_VER_LUMA_SP 32, 64
FILTER_VER_LUMA_SP 64, 48
FILTER_VER_LUMA_SP 48, 64
FILTER_VER_LUMA_SP 64, 16
FILTER_VER_LUMA_SP 16, 64

;-----------------------------------------------------------------------------
; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal filterPixelToShort_4x2, 3, 4, 3
    mov r3d, r3m
    add r3d, r3d

    ; load constant
    mova m1, [pb_128]
    mova m2, [tab_c_64_n64]

    movd m0, [r0]
    pinsrd m0, [r0 + r1], 1
    punpcklbw m0, m1
    pmaddubsw m0, m2
    movq [r2 + r3 * 0], m0
    movhps [r2 + r3 * 1], m0
    RET

;-----------------------------------------------------------------------------
; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
;-----------------------------------------------------------------------------
INIT_XMM ssse3
cglobal filterPixelToShort_8x2, 3, 4, 3
    mov r3d, r3m
    add r3d, r3d

    ; load constant
    mova m1, [pb_128]
    mova m2, [tab_c_64_n64]

    movh m0, [r0]
    punpcklbw m0, m1
    pmaddubsw m0, m2
    movu [r2 + r3 * 0], m0

    movh m0, [r0 + r1]
    punpcklbw m0, m1
    pmaddubsw m0, m2
    movu [r2 + r3 * 1], m0
    RET
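; Note (added): the two filterPixelToShort routines above produce exactly the ps
; intermediate consumed by the sp kernels: interleaving with pb_128 and pmaddubsw
; against tab_c_64_n64 ({64, -64} pairs) computes 64*pel - 128*64 = (pel << 6) - 8192.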
+ r1]
    punpcklbw  m0, m1
    pmaddubsw  m0, m2
    movu       [r2 + r3 * 1], m0
    RET

%macro PROCESS_CHROMA_SP_W4_4R 0
    movq       m0, [r0]
    movq       m1, [r0 + r1]
    punpcklwd  m0, m1                          ;m0=[0 1]
    pmaddwd    m0, [r6 + 0 *16]                ;m0=[0+1]  Row1
    lea        r0, [r0 + 2 * r1]
    movq       m4, [r0]
    punpcklwd  m1, m4                          ;m1=[1 2]
    pmaddwd    m1, [r6 + 0 *16]                ;m1=[1+2]  Row2
    movq       m5, [r0 + r1]
    punpcklwd  m4, m5                          ;m4=[2 3]
    pmaddwd    m2, m4, [r6 + 0 *16]            ;m2=[2+3]  Row3
    pmaddwd    m4, [r6 + 1 * 16]
    paddd      m0, m4                          ;m0=[0+1+2+3]  Row1 done
    lea        r0, [r0 + 2 * r1]
    movq       m4, [r0]
    punpcklwd  m5, m4                          ;m5=[3 4]
    pmaddwd    m3, m5, [r6 + 0 *16]            ;m3=[3+4]  Row4
    pmaddwd    m5, [r6 + 1 * 16]
    paddd      m1, m5                          ;m1=[1+2+3+4]  Row2
    movq       m5, [r0 + r1]
    punpcklwd  m4, m5                          ;m4=[4 5]
    pmaddwd    m4, [r6 + 1 * 16]
    paddd      m2, m4                          ;m2=[2+3+4+5]  Row3
    movq       m4, [r0 + 2 * r1]
    punpcklwd  m5, m4                          ;m5=[5 6]
    pmaddwd    m5, [r6 + 1 * 16]
    paddd      m3, m5                          ;m3=[3+4+5+6]  Row4
%endmacro

;--------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;--------------------------------------------------------------------------------------------------------------
%macro FILTER_VER_CHROMA_SP 2
INIT_XMM sse4
cglobal interp_4tap_vert_sp_%1x%2, 5, 7, 7, 0-gprsize
    add        r1d, r1d
    sub        r0, r1
    shl        r4d, 5
%ifdef PIC
    lea        r5, [tab_ChromaCoeffV]
    lea        r6, [r5 + r4]
%else
    lea        r6, [tab_ChromaCoeffV + r4]
%endif
    mova       m6, [pd_526336]
    mov        dword [rsp], %2/4
.loopH:
    mov        r4d, (%1/4)
.loopW:
    PROCESS_CHROMA_SP_W4_4R
    paddd      m0, m6
    paddd      m1, m6
    paddd      m2, m6
    paddd      m3, m6
    psrad      m0, 12
    psrad      m1, 12
    psrad      m2, 12
    psrad      m3, 12
    packssdw   m0, m1
    packssdw   m2, m3
    packuswb   m0, m2
    movd       [r2], m0
    pextrd     [r2 + r3], m0, 1
    lea        r5, [r2 + 2 * r3]
    pextrd     [r5], m0, 2
    pextrd     [r5 + r3], m0, 3
    lea        r5, [4 * r1 - 2 * 4]
    sub        r0, r5
    add        r2, 4
    dec        r4d
    jnz        .loopW
    lea        r0, [r0 + 4 * r1 - 2 * %1]
    lea        r2, [r2 + 4 * r3 - %1]
    dec        dword [rsp]
    jnz        .loopH
    RET
%endmacro

FILTER_VER_CHROMA_SP 4, 4
FILTER_VER_CHROMA_SP 4, 8
FILTER_VER_CHROMA_SP 16, 16
FILTER_VER_CHROMA_SP 16, 8
FILTER_VER_CHROMA_SP 16, 12
FILTER_VER_CHROMA_SP 12, 16
FILTER_VER_CHROMA_SP 16, 4
FILTER_VER_CHROMA_SP 4, 16
FILTER_VER_CHROMA_SP 32, 32
FILTER_VER_CHROMA_SP 32, 16
FILTER_VER_CHROMA_SP 16, 32
FILTER_VER_CHROMA_SP 32, 24
FILTER_VER_CHROMA_SP 24, 32
FILTER_VER_CHROMA_SP 32, 8
FILTER_VER_CHROMA_SP 16, 24
FILTER_VER_CHROMA_SP 16, 64
FILTER_VER_CHROMA_SP 12, 32
FILTER_VER_CHROMA_SP 4, 32
FILTER_VER_CHROMA_SP 32, 64
FILTER_VER_CHROMA_SP 32, 48
FILTER_VER_CHROMA_SP 24, 64
FILTER_VER_CHROMA_SP 64, 64
FILTER_VER_CHROMA_SP 64, 32
FILTER_VER_CHROMA_SP 64, 48
FILTER_VER_CHROMA_SP 48, 64
FILTER_VER_CHROMA_SP 64, 16

%macro PROCESS_CHROMA_SP_W2_4R 1
    movd       m0, [r0]
    movd       m1, [r0 + r1]
    punpcklwd  m0, m1                          ;m0=[0 1]
    lea        r0, [r0 + 2 * r1]
    movd       m2, [r0]
    punpcklwd  m1, m2                          ;m1=[1 2]
    punpcklqdq m0, m1                          ;m0=[0 1 1 2]
    pmaddwd    m0, [%1 + 0 *16]                ;m0=[0+1 1+2]  Row 1-2
    movd       m1, [r0 + r1]
    punpcklwd  m2, m1                          ;m2=[2 3]
    lea        r0, [r0 + 2 * r1]
    movd       m3, [r0]
    punpcklwd  m1, m3                          ;m1=[3 4]
    punpcklqdq m2, m1                          ;m2=[2 3 3 4]
    pmaddwd    m4, m2, [%1 + 1 * 16]           ;m4=[2+3 3+4]  Row 1-2
    pmaddwd    m2, [%1 + 0 * 16]               ;m2=[2+3 3+4]  Row 3-4
    paddd      m0, m4                          ;m0=[0+1+2+3 1+2+3+4]  Row 1-2
    movd       m1, [r0 + r1]
    punpcklwd  m3, m1                          ;m3=[4 5]
    movd       m4, [r0 + 2 * r1]
    punpcklwd  m1, m4                          ;m1=[5 6]
    punpcklqdq m3, m1                          ;m3=[4 5 5 6]
    pmaddwd    m3, [%1 + 1 * 16]               ;m3=[4+5 5+6]  Row 3-4
    paddd      m2, m3                          ;m2=[2+3+4+5 3+4+5+6]  Row 3-4
%endmacro

;-------------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------------
%macro FILTER_VER_CHROMA_SP_W2_4R 2
INIT_XMM sse4
cglobal interp_4tap_vert_sp_%1x%2, 5, 6, 6
    add        r1d, r1d
    sub        r0, r1
    shl        r4d, 5
%ifdef PIC
    lea        r5, [tab_ChromaCoeffV]
    lea        r5, [r5 + r4]
%else
    lea        r5, [tab_ChromaCoeffV + r4]
%endif
    mova       m5, [pd_526336]
    mov        r4d, (%2/4)
.loopH:
    PROCESS_CHROMA_SP_W2_4R r5
    paddd      m0, m5
    paddd      m2, m5
    psrad      m0, 12
    psrad      m2, 12
    packssdw   m0, m2
    packuswb   m0, m0
    pextrw     [r2], m0, 0
    pextrw     [r2 + r3], m0, 1
    lea        r2, [r2 + 2 * r3]
    pextrw     [r2], m0, 2
    pextrw     [r2 + r3], m0, 3
    lea        r2, [r2 + 2 * r3]
    dec        r4d
    jnz        .loopH
    RET
%endmacro

FILTER_VER_CHROMA_SP_W2_4R 2, 4
FILTER_VER_CHROMA_SP_W2_4R 2, 8
FILTER_VER_CHROMA_SP_W2_4R 2, 16

;--------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_sp_4x2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;--------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal interp_4tap_vert_sp_4x2, 5, 6, 5
    add        r1d, r1d
    sub        r0, r1
    shl        r4d, 5
%ifdef PIC
    lea        r5, [tab_ChromaCoeffV]
    lea        r5, [r5 + r4]
%else
    lea        r5, [tab_ChromaCoeffV + r4]
%endif
    mova       m4, [pd_526336]
    movq       m0, [r0]
    movq       m1, [r0 + r1]
    punpcklwd  m0, m1                          ;m0=[0 1]
    pmaddwd    m0, [r5 + 0 *16]                ;m0=[0+1]  Row1
    lea        r0, [r0 + 2 * r1]
    movq       m2, [r0]
    punpcklwd  m1, m2                          ;m1=[1 2]
    pmaddwd    m1, [r5 + 0 *16]                ;m1=[1+2]  Row2
    movq       m3, [r0 + r1]
    punpcklwd  m2, m3                          ;m2=[2 3]
    pmaddwd    m2, [r5 + 1 * 16]
    paddd      m0, m2                          ;m0=[0+1+2+3]  Row1 done
    paddd      m0, m4
    psrad      m0, 12
    movq       m2, [r0 + 2 * r1]
    punpcklwd  m3, m2                          ;m3=[3 4]
    pmaddwd    m3, [r5 + 1 * 16]
    paddd      m1, m3                          ;m1=[1+2+3+4]  Row2 done
    paddd      m1, m4
    psrad      m1, 12
    packssdw   m0, m1
    packuswb   m0, m0
    movd       [r2], m0
    pextrd     [r2 + r3], m0, 1
    RET

;-------------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_sp_6x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------------
%macro FILTER_VER_CHROMA_SP_W6_H4 2
INIT_XMM sse4
cglobal interp_4tap_vert_sp_6x%2, 5, 7, 7
    add        r1d, r1d
    sub        r0, r1
    shl        r4d, 5
%ifdef PIC
    lea        r5, [tab_ChromaCoeffV]
    lea        r6, [r5 + r4]
%else
    lea        r6, [tab_ChromaCoeffV + r4]
%endif
    mova       m6, [pd_526336]
    mov        r4d, %2/4
.loopH:
    PROCESS_CHROMA_SP_W4_4R
    paddd      m0, m6
    paddd      m1, m6
    paddd      m2, m6
    paddd      m3, m6
    psrad      m0, 12
    psrad      m1, 12
    psrad      m2, 12
    psrad      m3, 12
    packssdw   m0, m1
    packssdw   m2, m3
    packuswb   m0, m2
    movd       [r2], m0
    pextrd     [r2 + r3], m0, 1
    lea        r5, [r2 + 2 * r3]
    pextrd     [r5], m0, 2
    pextrd     [r5 + r3], m0, 3
    lea        r5, [4 * r1 - 2 * 4]
    sub        r0, r5
    add        r2, 4
    PROCESS_CHROMA_SP_W2_4R r6
    paddd      m0, m6
    paddd      m2, m6
    psrad      m0, 12
    psrad      m2, 12
    packssdw   m0, m2
    packuswb   m0, m0
    pextrw     [r2], m0, 0
    pextrw     [r2 + r3], m0, 1
    lea        r2, [r2 + 2 * r3]
    pextrw     [r2], m0, 2
    pextrw     [r2 + r3], m0, 3
    sub        r0, 2 * 4
    lea        r2, [r2 + 2 * r3 - 4]
    dec        r4d
    jnz        .loopH
    RET
%endmacro

FILTER_VER_CHROMA_SP_W6_H4 6, 8
FILTER_VER_CHROMA_SP_W6_H4 6, 16

%macro PROCESS_CHROMA_SP_W8_2R 0
    movu       m1, [r0]
    movu       m3, [r0 + r1]
    punpcklwd  m0, m1, m3
    pmaddwd    m0, [r5 + 0 * 16]               ;m0 = [0l+1l]  Row1l
    punpckhwd  m1, m3
    pmaddwd    m1, [r5 + 0 * 16]               ;m1 = [0h+1h]  Row1h
    movu       m4, [r0 + 2 * r1]
    punpcklwd  m2, m3, m4
    pmaddwd    m2, [r5 + 0 * 16]               ;m2 = [1l+2l]  Row2l
    punpckhwd  m3, m4
    pmaddwd    m3, [r5 + 0 * 16]               ;m3 = [1h+2h]  Row2h
    lea        r0, [r0 + 2 * r1]
    movu       m5, [r0 + r1]
    punpcklwd  m6, m4, m5
    pmaddwd    m6, [r5 + 1 * 16]               ;m6 = [2l+3l]  Row1l
    paddd      m0, m6                          ;m0 = [0l+1l+2l+3l]  Row1l sum
    punpckhwd  m4, m5
    pmaddwd    m4, [r5 + 1 * 16]               ;m4 = [2h+3h]  Row1h
    paddd      m1, m4                          ;m1 = [0h+1h+2h+3h]  Row1h sum
    movu       m4, [r0 + 2 * r1]
    punpcklwd  m6, m5, m4
    pmaddwd    m6, [r5 + 1 * 16]               ;m6 = [3l+4l]  Row2l
    paddd      m2, m6                          ;m2 = [1l+2l+3l+4l]  Row2l sum
    punpckhwd  m5, m4
    pmaddwd    m5, [r5 + 1 * 16]               ;m5 = [3h+4h]  Row2h
    paddd      m3, m5                          ;m3 = [1h+2h+3h+4h]  Row2h sum
%endmacro

;--------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_sp_8x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;--------------------------------------------------------------------------------------------------------------
%macro FILTER_VER_CHROMA_SP_W8_H2 2
INIT_XMM sse2
cglobal interp_4tap_vert_sp_%1x%2, 5, 6, 8
    add        r1d, r1d
    sub        r0, r1
    shl        r4d, 5
%ifdef PIC
    lea        r5, [tab_ChromaCoeffV]
    lea        r5, [r5 + r4]
%else
    lea        r5, [tab_ChromaCoeffV + r4]
%endif
    mova       m7, [pd_526336]
    mov        r4d, %2/2
.loopH:
    PROCESS_CHROMA_SP_W8_2R
    paddd      m0, m7
    paddd      m1, m7
    paddd      m2, m7
    paddd      m3, m7
    psrad      m0, 12
    psrad      m1, 12
    psrad      m2, 12
    psrad      m3, 12
    packssdw   m0, m1
    packssdw   m2, m3
    packuswb   m0, m2
    movlps     [r2], m0
    movhps     [r2 + r3], m0
    lea        r2, [r2 + 2 * r3]
    dec        r4d
    jnz        .loopH
    RET
%endmacro

FILTER_VER_CHROMA_SP_W8_H2 8, 2
FILTER_VER_CHROMA_SP_W8_H2 8, 4
FILTER_VER_CHROMA_SP_W8_H2 8, 6
FILTER_VER_CHROMA_SP_W8_H2 8, 8
FILTER_VER_CHROMA_SP_W8_H2 8, 16
FILTER_VER_CHROMA_SP_W8_H2 8, 32
FILTER_VER_CHROMA_SP_W8_H2 8, 12
FILTER_VER_CHROMA_SP_W8_H2 8, 64

;-----------------------------------------------------------------------------------------------------------------------------
; void interp_4tap_horiz_ps_2x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
;-----------------------------------------------------------------------------------------------------------------------------
%macro FILTER_HORIZ_CHROMA_2xN 2
INIT_XMM sse4
cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 4, src, srcstride, dst, dststride
%define coef2  m3
%define Tm0    m2
%define t1     m1
%define t0     m0

    dec        srcq
    mov        r4d, r4m
    add        dststrided, dststrided

%ifdef PIC
    lea        r6, [tab_ChromaCoeff]
    movd       coef2, [r6 + r4 * 4]
%else
    movd       coef2, [tab_ChromaCoeff + r4 * 4]
%endif
    pshufd     coef2, coef2, 0
    mova       t1, [pw_2000]
    mova       Tm0, [tab_Tm]

    mov        r4d, %2
    cmp        r5m, byte 0
    je         .loopH
    sub        srcq, srcstrideq
    add        r4d, 3

.loopH:
    movh       t0, [srcq]
    pshufb     t0, t0, Tm0
    pmaddubsw  t0, coef2
    phaddw     t0, t0
    psubw      t0, t1
    movd       [dstq], t0

    lea        srcq, [srcq + srcstrideq]
    lea        dstq, [dstq + dststrideq]

    dec        r4d
    jnz        .loopH
    RET
%endmacro

FILTER_HORIZ_CHROMA_2xN 2, 4
FILTER_HORIZ_CHROMA_2xN 2, 8
FILTER_HORIZ_CHROMA_2xN 2, 16

;-----------------------------------------------------------------------------------------------------------------------------
; void interp_4tap_horiz_ps_4x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
;-----------------------------------------------------------------------------------------------------------------------------
%macro FILTER_HORIZ_CHROMA_4xN 2
INIT_XMM sse4
cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 4, src, srcstride, dst, dststride
%define coef2  m3
%define Tm0    m2
%define t1     m1
%define t0     m0

    dec        srcq
    mov        r4d, r4m
    add        dststrided, dststrided

%ifdef PIC
    lea        r6, [tab_ChromaCoeff]
    movd       coef2, [r6 + r4 * 4]
%else
    movd       coef2,
[tab_ChromaCoeff + r4 * 4] %endif pshufd coef2, coef2, 0 mova t1, [pw_2000] mova Tm0, [tab_Tm] mov r4d, %2 cmp r5m, byte 0 je .loopH sub srcq, srcstrideq add r4d, 3 .loopH: movh t0, [srcq] pshufb t0, t0, Tm0 pmaddubsw t0, coef2 phaddw t0, t0 psubw t0, t1 movlps [dstq], t0 lea srcq, [srcq + srcstrideq] lea dstq, [dstq + dststrideq] dec r4d jnz .loopH RET %endmacro FILTER_HORIZ_CHROMA_4xN 4, 2 FILTER_HORIZ_CHROMA_4xN 4, 4 FILTER_HORIZ_CHROMA_4xN 4, 8 FILTER_HORIZ_CHROMA_4xN 4, 16 FILTER_HORIZ_CHROMA_4xN 4, 32 %macro PROCESS_CHROMA_W6 3 movu %1, [srcq] pshufb %2, %1, Tm0 pmaddubsw %2, coef2 pshufb %1, %1, Tm1 pmaddubsw %1, coef2 phaddw %2, %1 psubw %2, %3 movh [dstq], %2 pshufd %2, %2, 2 movd [dstq + 8], %2 %endmacro %macro PROCESS_CHROMA_W12 3 movu %1, [srcq] pshufb %2, %1, Tm0 pmaddubsw %2, coef2 pshufb %1, %1, Tm1 pmaddubsw %1, coef2 phaddw %2, %1 psubw %2, %3 movu [dstq], %2 movu %1, [srcq + 8] pshufb %1, %1, Tm0 pmaddubsw %1, coef2 phaddw %1, %1 psubw %1, %3 movh [dstq + 16], %1 %endmacro ;----------------------------------------------------------------------------------------------------------------------------- ; void interp_4tap_horiz_ps_6x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) ;----------------------------------------------------------------------------------------------------------------------------- %macro FILTER_HORIZ_CHROMA 2 INIT_XMM sse4 cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 6, src, srcstride, dst, dststride %define coef2 m5 %define Tm0 m4 %define Tm1 m3 %define t2 m2 %define t1 m1 %define t0 m0 dec srcq mov r4d, r4m add dststrided, dststrided %ifdef PIC lea r6, [tab_ChromaCoeff] movd coef2, [r6 + r4 * 4] %else movd coef2, [tab_ChromaCoeff + r4 * 4] %endif pshufd coef2, coef2, 0 mova t2, [pw_2000] mova Tm0, [tab_Tm] mova Tm1, [tab_Tm + 16] mov r4d, %2 cmp r5m, byte 0 je .loopH sub srcq, srcstrideq add r4d, 3 .loopH: PROCESS_CHROMA_W%1 t0, t1, t2 add srcq, srcstrideq add dstq, dststrideq dec r4d jnz .loopH RET %endmacro FILTER_HORIZ_CHROMA 6, 8 FILTER_HORIZ_CHROMA 12, 16 FILTER_HORIZ_CHROMA 6, 16 FILTER_HORIZ_CHROMA 12, 32 %macro PROCESS_CHROMA_W8 3 movu %1, [srcq] pshufb %2, %1, Tm0 pmaddubsw %2, coef2 pshufb %1, %1, Tm1 pmaddubsw %1, coef2 phaddw %2, %1 psubw %2, %3 movu [dstq], %2 %endmacro ;----------------------------------------------------------------------------------------------------------------------------- ; void interp_4tap_horiz_ps_8x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) ;----------------------------------------------------------------------------------------------------------------------------- %macro FILTER_HORIZ_CHROMA_8xN 2 INIT_XMM sse4 cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 6, src, srcstride, dst, dststride %define coef2 m5 %define Tm0 m4 %define Tm1 m3 %define t2 m2 %define t1 m1 %define t0 m0 dec srcq mov r4d, r4m add dststrided, dststrided %ifdef PIC lea r6, [tab_ChromaCoeff] movd coef2, [r6 + r4 * 4] %else movd coef2, [tab_ChromaCoeff + r4 * 4] %endif pshufd coef2, coef2, 0 mova t2, [pw_2000] mova Tm0, [tab_Tm] mova Tm1, [tab_Tm + 16] mov r4d, %2 cmp r5m, byte 0 je .loopH sub srcq, srcstrideq add r4d, 3 .loopH: PROCESS_CHROMA_W8 t0, t1, t2 add srcq, srcstrideq add dstq, dststrideq dec r4d jnz .loopH RET %endmacro FILTER_HORIZ_CHROMA_8xN 8, 2 FILTER_HORIZ_CHROMA_8xN 8, 4 FILTER_HORIZ_CHROMA_8xN 8, 6 FILTER_HORIZ_CHROMA_8xN 8, 8 FILTER_HORIZ_CHROMA_8xN 8, 16 FILTER_HORIZ_CHROMA_8xN 8, 32 FILTER_HORIZ_CHROMA_8xN 8, 12 
FILTER_HORIZ_CHROMA_8xN 8, 64 %macro PROCESS_CHROMA_W16 4 movu %1, [srcq] pshufb %2, %1, Tm0 pmaddubsw %2, coef2 pshufb %1, %1, Tm1 pmaddubsw %1, coef2 phaddw %2, %1 movu %1, [srcq + 8] pshufb %4, %1, Tm0 pmaddubsw %4, coef2 pshufb %1, %1, Tm1 pmaddubsw %1, coef2 phaddw %4, %1 psubw %2, %3 psubw %4, %3 movu [dstq], %2 movu [dstq + 16], %4 %endmacro %macro PROCESS_CHROMA_W24 4 movu %1, [srcq] pshufb %2, %1, Tm0 pmaddubsw %2, coef2 pshufb %1, %1, Tm1 pmaddubsw %1, coef2 phaddw %2, %1 movu %1, [srcq + 8] pshufb %4, %1, Tm0 pmaddubsw %4, coef2 pshufb %1, %1, Tm1 pmaddubsw %1, coef2 phaddw %4, %1 psubw %2, %3 psubw %4, %3 movu [dstq], %2 movu [dstq + 16], %4 movu %1, [srcq + 16] pshufb %2, %1, Tm0 pmaddubsw %2, coef2 pshufb %1, %1, Tm1 pmaddubsw %1, coef2 phaddw %2, %1 psubw %2, %3 movu [dstq + 32], %2 %endmacro %macro PROCESS_CHROMA_W32 4 movu %1, [srcq] pshufb %2, %1, Tm0 pmaddubsw %2, coef2 pshufb %1, %1, Tm1 pmaddubsw %1, coef2 phaddw %2, %1 movu %1, [srcq + 8] pshufb %4, %1, Tm0 pmaddubsw %4, coef2 pshufb %1, %1, Tm1 pmaddubsw %1, coef2 phaddw %4, %1 psubw %2, %3 psubw %4, %3 movu [dstq], %2 movu [dstq + 16], %4 movu %1, [srcq + 16] pshufb %2, %1, Tm0 pmaddubsw %2, coef2 pshufb %1, %1, Tm1 pmaddubsw %1, coef2 phaddw %2, %1 movu %1, [srcq + 24] pshufb %4, %1, Tm0 pmaddubsw %4, coef2 pshufb %1, %1, Tm1 pmaddubsw %1, coef2 phaddw %4, %1 psubw %2, %3 psubw %4, %3 movu [dstq + 32], %2 movu [dstq + 48], %4 %endmacro %macro PROCESS_CHROMA_W16o 5 movu %1, [srcq + %5] pshufb %2, %1, Tm0 pmaddubsw %2, coef2 pshufb %1, %1, Tm1 pmaddubsw %1, coef2 phaddw %2, %1 movu %1, [srcq + %5 + 8] pshufb %4, %1, Tm0 pmaddubsw %4, coef2 pshufb %1, %1, Tm1 pmaddubsw %1, coef2 phaddw %4, %1 psubw %2, %3 psubw %4, %3 movu [dstq + %5 * 2], %2 movu [dstq + %5 * 2 + 16], %4 %endmacro %macro PROCESS_CHROMA_W48 4 PROCESS_CHROMA_W16o %1, %2, %3, %4, 0 PROCESS_CHROMA_W16o %1, %2, %3, %4, 16 PROCESS_CHROMA_W16o %1, %2, %3, %4, 32 %endmacro %macro PROCESS_CHROMA_W64 4 PROCESS_CHROMA_W16o %1, %2, %3, %4, 0 PROCESS_CHROMA_W16o %1, %2, %3, %4, 16 PROCESS_CHROMA_W16o %1, %2, %3, %4, 32 PROCESS_CHROMA_W16o %1, %2, %3, %4, 48 %endmacro ;------------------------------------------------------------------------------------------------------------------------------ ; void interp_4tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) ;------------------------------------------------------------------------------------------------------------------------------ %macro FILTER_HORIZ_CHROMA_WxN 2 INIT_XMM sse4 cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 7, src, srcstride, dst, dststride %define coef2 m6 %define Tm0 m5 %define Tm1 m4 %define t3 m3 %define t2 m2 %define t1 m1 %define t0 m0 dec srcq mov r4d, r4m add dststrided, dststrided %ifdef PIC lea r6, [tab_ChromaCoeff] movd coef2, [r6 + r4 * 4] %else movd coef2, [tab_ChromaCoeff + r4 * 4] %endif pshufd coef2, coef2, 0 mova t2, [pw_2000] mova Tm0, [tab_Tm] mova Tm1, [tab_Tm + 16] mov r4d, %2 cmp r5m, byte 0 je .loopH sub srcq, srcstrideq add r4d, 3 .loopH: PROCESS_CHROMA_W%1 t0, t1, t2, t3 add srcq, srcstrideq add dstq, dststrideq dec r4d jnz .loopH RET %endmacro FILTER_HORIZ_CHROMA_WxN 16, 4 FILTER_HORIZ_CHROMA_WxN 16, 8 FILTER_HORIZ_CHROMA_WxN 16, 12 FILTER_HORIZ_CHROMA_WxN 16, 16 FILTER_HORIZ_CHROMA_WxN 16, 32 FILTER_HORIZ_CHROMA_WxN 24, 32 FILTER_HORIZ_CHROMA_WxN 32, 8 FILTER_HORIZ_CHROMA_WxN 32, 16 FILTER_HORIZ_CHROMA_WxN 32, 24 FILTER_HORIZ_CHROMA_WxN 32, 32 FILTER_HORIZ_CHROMA_WxN 16, 24 FILTER_HORIZ_CHROMA_WxN 16, 64 
FILTER_HORIZ_CHROMA_WxN 24, 64 FILTER_HORIZ_CHROMA_WxN 32, 48 FILTER_HORIZ_CHROMA_WxN 32, 64 FILTER_HORIZ_CHROMA_WxN 64, 64 FILTER_HORIZ_CHROMA_WxN 64, 32 FILTER_HORIZ_CHROMA_WxN 64, 48 FILTER_HORIZ_CHROMA_WxN 48, 64 FILTER_HORIZ_CHROMA_WxN 64, 16 ;--------------------------------------------------------------------------------------------------------------- ; void interp_4tap_vert_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) ;--------------------------------------------------------------------------------------------------------------- %macro FILTER_V_PS_W16n 2 INIT_XMM sse4 cglobal interp_4tap_vert_ps_%1x%2, 4, 7, 8 mov r4d, r4m sub r0, r1 add r3d, r3d %ifdef PIC lea r5, [tab_ChromaCoeff] movd m0, [r5 + r4 * 4] %else movd m0, [tab_ChromaCoeff + r4 * 4] %endif pshufb m1, m0, [tab_Vm] pshufb m0, [tab_Vm + 16] mov r4d, %2/2 .loop: mov r6d, %1/16 .loopW: movu m2, [r0] movu m3, [r0 + r1] punpcklbw m4, m2, m3 punpckhbw m2, m3 pmaddubsw m4, m1 pmaddubsw m2, m1 lea r5, [r0 + 2 * r1] movu m5, [r5] movu m7, [r5 + r1] punpcklbw m6, m5, m7 pmaddubsw m6, m0 paddw m4, m6 punpckhbw m6, m5, m7 pmaddubsw m6, m0 paddw m2, m6 mova m6, [pw_2000] psubw m4, m6 psubw m2, m6 movu [r2], m4 movu [r2 + 16], m2 punpcklbw m4, m3, m5 punpckhbw m3, m5 pmaddubsw m4, m1 pmaddubsw m3, m1 movu m5, [r5 + 2 * r1] punpcklbw m2, m7, m5 punpckhbw m7, m5 pmaddubsw m2, m0 pmaddubsw m7, m0 paddw m4, m2 paddw m3, m7 psubw m4, m6 psubw m3, m6 movu [r2 + r3], m4 movu [r2 + r3 + 16], m3 add r0, 16 add r2, 32 dec r6d jnz .loopW lea r0, [r0 + r1 * 2 - %1] lea r2, [r2 + r3 * 2 - %1 * 2] dec r4d jnz .loop RET %endmacro FILTER_V_PS_W16n 64, 64 FILTER_V_PS_W16n 64, 32 FILTER_V_PS_W16n 64, 48 FILTER_V_PS_W16n 48, 64 FILTER_V_PS_W16n 64, 16 ;------------------------------------------------------------------------------------------------------------ ;void interp_4tap_vert_ps_2x4(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) ;------------------------------------------------------------------------------------------------------------ INIT_XMM sse4 cglobal interp_4tap_vert_ps_2x4, 4, 6, 7 mov r4d, r4m sub r0, r1 add r3d, r3d %ifdef PIC lea r5, [tab_ChromaCoeff] movd m0, [r5 + r4 * 4] %else movd m0, [tab_ChromaCoeff + r4 * 4] %endif pshufb m0, [tab_Cm] lea r5, [3 * r1] movd m2, [r0] movd m3, [r0 + r1] movd m4, [r0 + 2 * r1] movd m5, [r0 + r5] punpcklbw m2, m3 punpcklbw m6, m4, m5 punpcklbw m2, m6 pmaddubsw m2, m0 lea r0, [r0 + 4 * r1] movd m6, [r0] punpcklbw m3, m4 punpcklbw m1, m5, m6 punpcklbw m3, m1 pmaddubsw m3, m0 phaddw m2, m3 mova m1, [pw_2000] psubw m2, m1 movd [r2], m2 pextrd [r2 + r3], m2, 2 movd m2, [r0 + r1] punpcklbw m4, m5 punpcklbw m3, m6, m2 punpcklbw m4, m3 pmaddubsw m4, m0 movd m3, [r0 + 2 * r1] punpcklbw m5, m6 punpcklbw m2, m3 punpcklbw m5, m2 pmaddubsw m5, m0 phaddw m4, m5 psubw m4, m1 lea r2, [r2 + 2 * r3] movd [r2], m4 pextrd [r2 + r3], m4, 2 RET ;------------------------------------------------------------------------------------------------------------- ; void interp_4tap_vert_ps_2x8(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) ;------------------------------------------------------------------------------------------------------------- %macro FILTER_V_PS_W2 2 INIT_XMM sse4 cglobal interp_4tap_vert_ps_2x%2, 4, 6, 8 mov r4d, r4m sub r0, r1 add r3d, r3d %ifdef PIC lea r5, [tab_ChromaCoeff] movd m0, [r5 + r4 * 4] %else movd m0, [tab_ChromaCoeff + r4 * 4] %endif pshufb m0, [tab_Cm] mova m1, [pw_2000] lea r5, [3 * 
r1] mov r4d, %2/4 .loop: movd m2, [r0] movd m3, [r0 + r1] movd m4, [r0 + 2 * r1] movd m5, [r0 + r5] punpcklbw m2, m3 punpcklbw m6, m4, m5 punpcklbw m2, m6 pmaddubsw m2, m0 lea r0, [r0 + 4 * r1] movd m6, [r0] punpcklbw m3, m4 punpcklbw m7, m5, m6 punpcklbw m3, m7 pmaddubsw m3, m0 phaddw m2, m3 psubw m2, m1 movd [r2], m2 pshufd m2, m2, 2 movd [r2 + r3], m2 movd m2, [r0 + r1] punpcklbw m4, m5 punpcklbw m3, m6, m2 punpcklbw m4, m3 pmaddubsw m4, m0 movd m3, [r0 + 2 * r1] punpcklbw m5, m6 punpcklbw m2, m3 punpcklbw m5, m2 pmaddubsw m5, m0 phaddw m4, m5 psubw m4, m1 lea r2, [r2 + 2 * r3] movd [r2], m4 pshufd m4 , m4 ,2 movd [r2 + r3], m4 lea r2, [r2 + 2 * r3] dec r4d jnz .loop RET %endmacro FILTER_V_PS_W2 2, 8 FILTER_V_PS_W2 2, 16 ;----------------------------------------------------------------------------------------------------------------- ; void interp_4tap_vert_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) ;----------------------------------------------------------------------------------------------------------------- %macro FILTER_VER_CHROMA_SS 2 INIT_XMM sse2 cglobal interp_4tap_vert_ss_%1x%2, 5, 7, 6 ,0-gprsize add r1d, r1d add r3d, r3d sub r0, r1 shl r4d, 5 %ifdef PIC lea r5, [tab_ChromaCoeffV] lea r6, [r5 + r4] %else lea r6, [tab_ChromaCoeffV + r4] %endif mov dword [rsp], %2/4 .loopH: mov r4d, (%1/4) .loopW: PROCESS_CHROMA_SP_W4_4R psrad m0, 6 psrad m1, 6 psrad m2, 6 psrad m3, 6 packssdw m0, m1 packssdw m2, m3 movlps [r2], m0 movhps [r2 + r3], m0 lea r5, [r2 + 2 * r3] movlps [r5], m2 movhps [r5 + r3], m2 lea r5, [4 * r1 - 2 * 4] sub r0, r5 add r2, 2 * 4 dec r4d jnz .loopW lea r0, [r0 + 4 * r1 - 2 * %1] lea r2, [r2 + 4 * r3 - 2 * %1] dec dword [rsp] jnz .loopH RET %endmacro FILTER_VER_CHROMA_SS 4, 4 FILTER_VER_CHROMA_SS 4, 8 FILTER_VER_CHROMA_SS 16, 16 FILTER_VER_CHROMA_SS 16, 8 FILTER_VER_CHROMA_SS 16, 12 FILTER_VER_CHROMA_SS 12, 16 FILTER_VER_CHROMA_SS 16, 4 FILTER_VER_CHROMA_SS 4, 16 FILTER_VER_CHROMA_SS 32, 32 FILTER_VER_CHROMA_SS 32, 16 FILTER_VER_CHROMA_SS 16, 32 FILTER_VER_CHROMA_SS 32, 24 FILTER_VER_CHROMA_SS 24, 32 FILTER_VER_CHROMA_SS 32, 8 FILTER_VER_CHROMA_SS 16, 24 FILTER_VER_CHROMA_SS 12, 32 FILTER_VER_CHROMA_SS 4, 32 FILTER_VER_CHROMA_SS 32, 64 FILTER_VER_CHROMA_SS 16, 64 FILTER_VER_CHROMA_SS 32, 48 FILTER_VER_CHROMA_SS 24, 64 FILTER_VER_CHROMA_SS 64, 64 FILTER_VER_CHROMA_SS 64, 32 FILTER_VER_CHROMA_SS 64, 48 FILTER_VER_CHROMA_SS 48, 64 FILTER_VER_CHROMA_SS 64, 16 %macro FILTER_VER_CHROMA_S_AVX2_4x4 1 INIT_YMM avx2 cglobal interp_4tap_vert_%1_4x4, 4, 6, 7 mov r4d, r4m add r1d, r1d shl r4d, 6 sub r0, r1 %ifdef PIC lea r5, [pw_ChromaCoeffV] add r5, r4 %else lea r5, [pw_ChromaCoeffV + r4] %endif lea r4, [r1 * 3] %ifidn %1,sp mova m6, [pd_526336] %else add r3d, r3d %endif movq xm0, [r0] movq xm1, [r0 + r1] punpcklwd xm0, xm1 movq xm2, [r0 + r1 * 2] punpcklwd xm1, xm2 vinserti128 m0, m0, xm1, 1 ; m0 = [2 1 1 0] pmaddwd m0, [r5] movq xm3, [r0 + r4] punpcklwd xm2, xm3 lea r0, [r0 + 4 * r1] movq xm4, [r0] punpcklwd xm3, xm4 vinserti128 m2, m2, xm3, 1 ; m2 = [4 3 3 2] pmaddwd m5, m2, [r5 + 1 * mmsize] pmaddwd m2, [r5] paddd m0, m5 movq xm3, [r0 + r1] punpcklwd xm4, xm3 movq xm1, [r0 + r1 * 2] punpcklwd xm3, xm1 vinserti128 m4, m4, xm3, 1 ; m4 = [6 5 5 4] pmaddwd m4, [r5 + 1 * mmsize] paddd m2, m4 %ifidn %1,sp paddd m0, m6 paddd m2, m6 psrad m0, 12 psrad m2, 12 %else psrad m0, 6 psrad m2, 6 %endif packssdw m0, m2 vextracti128 xm2, m0, 1 lea r4, [r3 * 3] %ifidn %1,sp packuswb xm0, xm2 movd [r2], xm0 pextrd [r2 + r3], xm0, 2 pextrd [r2 + 
r3 * 2], xm0, 1 pextrd [r2 + r4], xm0, 3 %else movq [r2], xm0 movq [r2 + r3], xm2 movhps [r2 + r3 * 2], xm0 movhps [r2 + r4], xm2 %endif RET %endmacro FILTER_VER_CHROMA_S_AVX2_4x4 sp FILTER_VER_CHROMA_S_AVX2_4x4 ss %macro FILTER_VER_CHROMA_S_AVX2_4x8 1 INIT_YMM avx2 cglobal interp_4tap_vert_%1_4x8, 4, 6, 8 mov r4d, r4m shl r4d, 6 add r1d, r1d sub r0, r1 %ifdef PIC lea r5, [pw_ChromaCoeffV] add r5, r4 %else lea r5, [pw_ChromaCoeffV + r4] %endif lea r4, [r1 * 3] %ifidn %1,sp mova m7, [pd_526336] %else add r3d, r3d %endif movq xm0, [r0] movq xm1, [r0 + r1] punpcklwd xm0, xm1 movq xm2, [r0 + r1 * 2] punpcklwd xm1, xm2 vinserti128 m0, m0, xm1, 1 ; m0 = [2 1 1 0] pmaddwd m0, [r5] movq xm3, [r0 + r4] punpcklwd xm2, xm3 lea r0, [r0 + 4 * r1] movq xm4, [r0] punpcklwd xm3, xm4 vinserti128 m2, m2, xm3, 1 ; m2 = [4 3 3 2] pmaddwd m5, m2, [r5 + 1 * mmsize] pmaddwd m2, [r5] paddd m0, m5 movq xm3, [r0 + r1] punpcklwd xm4, xm3 movq xm1, [r0 + r1 * 2] punpcklwd xm3, xm1 vinserti128 m4, m4, xm3, 1 ; m4 = [6 5 5 4] pmaddwd m5, m4, [r5 + 1 * mmsize] paddd m2, m5 pmaddwd m4, [r5] movq xm3, [r0 + r4] punpcklwd xm1, xm3 lea r0, [r0 + 4 * r1] movq xm6, [r0] punpcklwd xm3, xm6 vinserti128 m1, m1, xm3, 1 ; m1 = [8 7 7 6] pmaddwd m5, m1, [r5 + 1 * mmsize] paddd m4, m5 pmaddwd m1, [r5] movq xm3, [r0 + r1] punpcklwd xm6, xm3 movq xm5, [r0 + 2 * r1] punpcklwd xm3, xm5 vinserti128 m6, m6, xm3, 1 ; m6 = [A 9 9 8] pmaddwd m6, [r5 + 1 * mmsize] paddd m1, m6 lea r4, [r3 * 3] %ifidn %1,sp paddd m0, m7 paddd m2, m7 paddd m4, m7 paddd m1, m7 psrad m0, 12 psrad m2, 12 psrad m4, 12 psrad m1, 12 %else psrad m0, 6 psrad m2, 6 psrad m4, 6 psrad m1, 6 %endif packssdw m0, m2 packssdw m4, m1 %ifidn %1,sp packuswb m0, m4 vextracti128 xm2, m0, 1 movd [r2], xm0 movd [r2 + r3], xm2 pextrd [r2 + r3 * 2], xm0, 1 pextrd [r2 + r4], xm2, 1 lea r2, [r2 + r3 * 4] pextrd [r2], xm0, 2 pextrd [r2 + r3], xm2, 2 pextrd [r2 + r3 * 2], xm0, 3 pextrd [r2 + r4], xm2, 3 %else vextracti128 xm2, m0, 1 vextracti128 xm1, m4, 1 movq [r2], xm0 movq [r2 + r3], xm2 movhps [r2 + r3 * 2], xm0 movhps [r2 + r4], xm2 lea r2, [r2 + r3 * 4] movq [r2], xm4 movq [r2 + r3], xm1 movhps [r2 + r3 * 2], xm4 movhps [r2 + r4], xm1 %endif RET %endmacro FILTER_VER_CHROMA_S_AVX2_4x8 sp FILTER_VER_CHROMA_S_AVX2_4x8 ss %macro PROCESS_CHROMA_AVX2_W4_16R 1 movq xm0, [r0] movq xm1, [r0 + r1] punpcklwd xm0, xm1 movq xm2, [r0 + r1 * 2] punpcklwd xm1, xm2 vinserti128 m0, m0, xm1, 1 ; m0 = [2 1 1 0] pmaddwd m0, [r5] movq xm3, [r0 + r4] punpcklwd xm2, xm3 lea r0, [r0 + 4 * r1] movq xm4, [r0] punpcklwd xm3, xm4 vinserti128 m2, m2, xm3, 1 ; m2 = [4 3 3 2] pmaddwd m5, m2, [r5 + 1 * mmsize] pmaddwd m2, [r5] paddd m0, m5 movq xm3, [r0 + r1] punpcklwd xm4, xm3 movq xm1, [r0 + r1 * 2] punpcklwd xm3, xm1 vinserti128 m4, m4, xm3, 1 ; m4 = [6 5 5 4] pmaddwd m5, m4, [r5 + 1 * mmsize] paddd m2, m5 pmaddwd m4, [r5] movq xm3, [r0 + r4] punpcklwd xm1, xm3 lea r0, [r0 + 4 * r1] movq xm6, [r0] punpcklwd xm3, xm6 vinserti128 m1, m1, xm3, 1 ; m1 = [8 7 7 6] pmaddwd m5, m1, [r5 + 1 * mmsize] paddd m4, m5 pmaddwd m1, [r5] movq xm3, [r0 + r1] punpcklwd xm6, xm3 movq xm5, [r0 + 2 * r1] punpcklwd xm3, xm5 vinserti128 m6, m6, xm3, 1 ; m6 = [10 9 9 8] pmaddwd m3, m6, [r5 + 1 * mmsize] paddd m1, m3 pmaddwd m6, [r5] %ifidn %1,sp paddd m0, m7 paddd m2, m7 paddd m4, m7 paddd m1, m7 psrad m4, 12 psrad m1, 12 psrad m0, 12 psrad m2, 12 %else psrad m0, 6 psrad m2, 6 psrad m4, 6 psrad m1, 6 %endif packssdw m0, m2 packssdw m4, m1 %ifidn %1,sp packuswb m0, m4 vextracti128 xm4, m0, 1 movd [r2], xm0 movd [r2 + r3], xm4 pextrd [r2 
+ r3 * 2], xm0, 1 pextrd [r2 + r6], xm4, 1 lea r2, [r2 + r3 * 4] pextrd [r2], xm0, 2 pextrd [r2 + r3], xm4, 2 pextrd [r2 + r3 * 2], xm0, 3 pextrd [r2 + r6], xm4, 3 %else vextracti128 xm2, m0, 1 vextracti128 xm1, m4, 1 movq [r2], xm0 movq [r2 + r3], xm2 movhps [r2 + r3 * 2], xm0 movhps [r2 + r6], xm2 lea r2, [r2 + r3 * 4] movq [r2], xm4 movq [r2 + r3], xm1 movhps [r2 + r3 * 2], xm4 movhps [r2 + r6], xm1 %endif movq xm2, [r0 + r4] punpcklwd xm5, xm2 lea r0, [r0 + 4 * r1] movq xm0, [r0] punpcklwd xm2, xm0 vinserti128 m5, m5, xm2, 1 ; m5 = [12 11 11 10] pmaddwd m2, m5, [r5 + 1 * mmsize] paddd m6, m2 pmaddwd m5, [r5] movq xm2, [r0 + r1] punpcklwd xm0, xm2 movq xm3, [r0 + 2 * r1] punpcklwd xm2, xm3 vinserti128 m0, m0, xm2, 1 ; m0 = [14 13 13 12] pmaddwd m2, m0, [r5 + 1 * mmsize] paddd m5, m2 pmaddwd m0, [r5] movq xm4, [r0 + r4] punpcklwd xm3, xm4 lea r0, [r0 + 4 * r1] movq xm1, [r0] punpcklwd xm4, xm1 vinserti128 m3, m3, xm4, 1 ; m3 = [16 15 15 14] pmaddwd m4, m3, [r5 + 1 * mmsize] paddd m0, m4 pmaddwd m3, [r5] movq xm4, [r0 + r1] punpcklwd xm1, xm4 movq xm2, [r0 + 2 * r1] punpcklwd xm4, xm2 vinserti128 m1, m1, xm4, 1 ; m1 = [18 17 17 16] pmaddwd m1, [r5 + 1 * mmsize] paddd m3, m1 %ifidn %1,sp paddd m6, m7 paddd m5, m7 paddd m0, m7 paddd m3, m7 psrad m6, 12 psrad m5, 12 psrad m0, 12 psrad m3, 12 %else psrad m6, 6 psrad m5, 6 psrad m0, 6 psrad m3, 6 %endif packssdw m6, m5 packssdw m0, m3 lea r2, [r2 + r3 * 4] %ifidn %1,sp packuswb m6, m0 vextracti128 xm0, m6, 1 movd [r2], xm6 movd [r2 + r3], xm0 pextrd [r2 + r3 * 2], xm6, 1 pextrd [r2 + r6], xm0, 1 lea r2, [r2 + r3 * 4] pextrd [r2], xm6, 2 pextrd [r2 + r3], xm0, 2 pextrd [r2 + r3 * 2], xm6, 3 pextrd [r2 + r6], xm0, 3 %else vextracti128 xm5, m6, 1 vextracti128 xm3, m0, 1 movq [r2], xm6 movq [r2 + r3], xm5 movhps [r2 + r3 * 2], xm6 movhps [r2 + r6], xm5 lea r2, [r2 + r3 * 4] movq [r2], xm0 movq [r2 + r3], xm3 movhps [r2 + r3 * 2], xm0 movhps [r2 + r6], xm3 %endif %endmacro %macro FILTER_VER_CHROMA_S_AVX2_4x16 1 INIT_YMM avx2 cglobal interp_4tap_vert_%1_4x16, 4, 7, 8 mov r4d, r4m shl r4d, 6 add r1d, r1d sub r0, r1 %ifdef PIC lea r5, [pw_ChromaCoeffV] add r5, r4 %else lea r5, [pw_ChromaCoeffV + r4] %endif lea r4, [r1 * 3] %ifidn %1,sp mova m7, [pd_526336] %else add r3d, r3d %endif lea r6, [r3 * 3] PROCESS_CHROMA_AVX2_W4_16R %1 RET %endmacro FILTER_VER_CHROMA_S_AVX2_4x16 sp FILTER_VER_CHROMA_S_AVX2_4x16 ss %macro FILTER_VER_CHROMA_S_AVX2_4x32 1 INIT_YMM avx2 cglobal interp_4tap_vert_%1_4x32, 4, 7, 8 mov r4d, r4m shl r4d, 6 add r1d, r1d sub r0, r1 %ifdef PIC lea r5, [pw_ChromaCoeffV] add r5, r4 %else lea r5, [pw_ChromaCoeffV + r4] %endif lea r4, [r1 * 3] %ifidn %1,sp mova m7, [pd_526336] %else add r3d, r3d %endif lea r6, [r3 * 3] %rep 2 PROCESS_CHROMA_AVX2_W4_16R %1 lea r2, [r2 + r3 * 4] %endrep RET %endmacro FILTER_VER_CHROMA_S_AVX2_4x32 sp FILTER_VER_CHROMA_S_AVX2_4x32 ss %macro FILTER_VER_CHROMA_S_AVX2_4x2 1 INIT_YMM avx2 cglobal interp_4tap_vert_%1_4x2, 4, 6, 6 mov r4d, r4m shl r4d, 6 add r1d, r1d sub r0, r1 %ifdef PIC lea r5, [pw_ChromaCoeffV] add r5, r4 %else lea r5, [pw_ChromaCoeffV + r4] %endif lea r4, [r1 * 3] %ifidn %1,sp mova m5, [pd_526336] %else add r3d, r3d %endif movq xm0, [r0] movq xm1, [r0 + r1] punpcklwd xm0, xm1 movq xm2, [r0 + r1 * 2] punpcklwd xm1, xm2 vinserti128 m0, m0, xm1, 1 ; m0 = [2 1 1 0] pmaddwd m0, [r5] movq xm3, [r0 + r4] punpcklwd xm2, xm3 movq xm4, [r0 + 4 * r1] punpcklwd xm3, xm4 vinserti128 m2, m2, xm3, 1 ; m2 = [4 3 3 2] pmaddwd m2, [r5 + 1 * mmsize] paddd m0, m2 %ifidn %1,sp paddd m0, m5 psrad m0, 12 %else psrad m0, 
6 %endif vextracti128 xm1, m0, 1 packssdw xm0, xm1 %ifidn %1,sp packuswb xm0, xm0 movd [r2], xm0 pextrd [r2 + r3], xm0, 1 %else movq [r2], xm0 movhps [r2 + r3], xm0 %endif RET %endmacro FILTER_VER_CHROMA_S_AVX2_4x2 sp FILTER_VER_CHROMA_S_AVX2_4x2 ss %macro FILTER_VER_CHROMA_S_AVX2_2x4 1 INIT_YMM avx2 cglobal interp_4tap_vert_%1_2x4, 4, 6, 6 mov r4d, r4m shl r4d, 6 add r1d, r1d sub r0, r1 %ifdef PIC lea r5, [pw_ChromaCoeffV] add r5, r4 %else lea r5, [pw_ChromaCoeffV + r4] %endif lea r4, [r1 * 3] %ifidn %1,sp mova m5, [pd_526336] %else add r3d, r3d %endif movd xm0, [r0] movd xm1, [r0 + r1] punpcklwd xm0, xm1 movd xm2, [r0 + r1 * 2] punpcklwd xm1, xm2 punpcklqdq xm0, xm1 ; m0 = [2 1 1 0] movd xm3, [r0 + r4] punpcklwd xm2, xm3 lea r0, [r0 + 4 * r1] movd xm4, [r0] punpcklwd xm3, xm4 punpcklqdq xm2, xm3 ; m2 = [4 3 3 2] vinserti128 m0, m0, xm2, 1 ; m0 = [4 3 3 2 2 1 1 0] movd xm1, [r0 + r1] punpcklwd xm4, xm1 movd xm3, [r0 + r1 * 2] punpcklwd xm1, xm3 punpcklqdq xm4, xm1 ; m4 = [6 5 5 4] vinserti128 m2, m2, xm4, 1 ; m2 = [6 5 5 4 4 3 3 2] pmaddwd m0, [r5] pmaddwd m2, [r5 + 1 * mmsize] paddd m0, m2 %ifidn %1,sp paddd m0, m5 psrad m0, 12 %else psrad m0, 6 %endif vextracti128 xm1, m0, 1 packssdw xm0, xm1 lea r4, [r3 * 3] %ifidn %1,sp packuswb xm0, xm0 pextrw [r2], xm0, 0 pextrw [r2 + r3], xm0, 1 pextrw [r2 + 2 * r3], xm0, 2 pextrw [r2 + r4], xm0, 3 %else movd [r2], xm0 pextrd [r2 + r3], xm0, 1 pextrd [r2 + 2 * r3], xm0, 2 pextrd [r2 + r4], xm0, 3 %endif RET %endmacro FILTER_VER_CHROMA_S_AVX2_2x4 sp FILTER_VER_CHROMA_S_AVX2_2x4 ss %macro FILTER_VER_CHROMA_S_AVX2_8x8 1 INIT_YMM avx2 cglobal interp_4tap_vert_%1_8x8, 4, 6, 8 mov r4d, r4m shl r4d, 6 add r1d, r1d %ifdef PIC lea r5, [pw_ChromaCoeffV] add r5, r4 %else lea r5, [pw_ChromaCoeffV + r4] %endif lea r4, [r1 * 3] sub r0, r1 %ifidn %1,sp mova m7, [pd_526336] %else add r3d, r3d %endif movu xm0, [r0] ; m0 = row 0 movu xm1, [r0 + r1] ; m1 = row 1 punpckhwd xm2, xm0, xm1 punpcklwd xm0, xm1 vinserti128 m0, m0, xm2, 1 pmaddwd m0, [r5] movu xm2, [r0 + r1 * 2] ; m2 = row 2 punpckhwd xm3, xm1, xm2 punpcklwd xm1, xm2 vinserti128 m1, m1, xm3, 1 pmaddwd m1, [r5] movu xm3, [r0 + r4] ; m3 = row 3 punpckhwd xm4, xm2, xm3 punpcklwd xm2, xm3 vinserti128 m2, m2, xm4, 1 pmaddwd m4, m2, [r5 + 1 * mmsize] pmaddwd m2, [r5] paddd m0, m4 lea r0, [r0 + r1 * 4] movu xm4, [r0] ; m4 = row 4 punpckhwd xm5, xm3, xm4 punpcklwd xm3, xm4 vinserti128 m3, m3, xm5, 1 pmaddwd m5, m3, [r5 + 1 * mmsize] pmaddwd m3, [r5] paddd m1, m5 %ifidn %1,sp paddd m0, m7 paddd m1, m7 psrad m0, 12 psrad m1, 12 %else psrad m0, 6 psrad m1, 6 %endif packssdw m0, m1 movu xm5, [r0 + r1] ; m5 = row 5 punpckhwd xm6, xm4, xm5 punpcklwd xm4, xm5 vinserti128 m4, m4, xm6, 1 pmaddwd m6, m4, [r5 + 1 * mmsize] paddd m2, m6 pmaddwd m4, [r5] movu xm6, [r0 + r1 * 2] ; m6 = row 6 punpckhwd xm1, xm5, xm6 punpcklwd xm5, xm6 vinserti128 m5, m5, xm1, 1 pmaddwd m1, m5, [r5 + 1 * mmsize] pmaddwd m5, [r5] paddd m3, m1 %ifidn %1,sp paddd m2, m7 paddd m3, m7 psrad m2, 12 psrad m3, 12 %else psrad m2, 6 psrad m3, 6 %endif packssdw m2, m3 movu xm1, [r0 + r4] ; m1 = row 7 punpckhwd xm3, xm6, xm1 punpcklwd xm6, xm1 vinserti128 m6, m6, xm3, 1 pmaddwd m3, m6, [r5 + 1 * mmsize] pmaddwd m6, [r5] paddd m4, m3 lea r4, [r3 * 3] %ifidn %1,sp packuswb m0, m2 mova m3, [interp8_hps_shuf] vpermd m0, m3, m0 vextracti128 xm2, m0, 1 movq [r2], xm0 movhps [r2 + r3], xm0 movq [r2 + r3 * 2], xm2 movhps [r2 + r4], xm2 %else vpermq m0, m0, 11011000b vpermq m2, m2, 11011000b movu [r2], xm0 vextracti128 xm0, m0, 1 vextracti128 xm3, m2, 1 movu [r2 + r3], 
xm0 movu [r2 + r3 * 2], xm2 movu [r2 + r4], xm3 %endif lea r2, [r2 + r3 * 4] lea r0, [r0 + r1 * 4] movu xm0, [r0] ; m0 = row 8 punpckhwd xm2, xm1, xm0 punpcklwd xm1, xm0 vinserti128 m1, m1, xm2, 1 pmaddwd m2, m1, [r5 + 1 * mmsize] pmaddwd m1, [r5] paddd m5, m2 %ifidn %1,sp paddd m4, m7 paddd m5, m7 psrad m4, 12 psrad m5, 12 %else psrad m4, 6 psrad m5, 6 %endif packssdw m4, m5 movu xm2, [r0 + r1] ; m2 = row 9 punpckhwd xm5, xm0, xm2 punpcklwd xm0, xm2 vinserti128 m0, m0, xm5, 1 pmaddwd m0, [r5 + 1 * mmsize] paddd m6, m0 movu xm5, [r0 + r1 * 2] ; m5 = row 10 punpckhwd xm0, xm2, xm5 punpcklwd xm2, xm5 vinserti128 m2, m2, xm0, 1 pmaddwd m2, [r5 + 1 * mmsize] paddd m1, m2 %ifidn %1,sp paddd m6, m7 paddd m1, m7 psrad m6, 12 psrad m1, 12 %else psrad m6, 6 psrad m1, 6 %endif packssdw m6, m1 %ifidn %1,sp packuswb m4, m6 vpermd m4, m3, m4 vextracti128 xm6, m4, 1 movq [r2], xm4 movhps [r2 + r3], xm4 movq [r2 + r3 * 2], xm6 movhps [r2 + r4], xm6 %else vpermq m4, m4, 11011000b vpermq m6, m6, 11011000b vextracti128 xm5, m4, 1 vextracti128 xm1, m6, 1 movu [r2], xm4 movu [r2 + r3], xm5 movu [r2 + r3 * 2], xm6 movu [r2 + r4], xm1 %endif RET %endmacro FILTER_VER_CHROMA_S_AVX2_8x8 sp FILTER_VER_CHROMA_S_AVX2_8x8 ss %macro PROCESS_CHROMA_S_AVX2_W8_16R 1 movu xm0, [r0] ; m0 = row 0 movu xm1, [r0 + r1] ; m1 = row 1 punpckhwd xm2, xm0, xm1 punpcklwd xm0, xm1 vinserti128 m0, m0, xm2, 1 pmaddwd m0, [r5] movu xm2, [r0 + r1 * 2] ; m2 = row 2 punpckhwd xm3, xm1, xm2 punpcklwd xm1, xm2 vinserti128 m1, m1, xm3, 1 pmaddwd m1, [r5] movu xm3, [r0 + r4] ; m3 = row 3 punpckhwd xm4, xm2, xm3 punpcklwd xm2, xm3 vinserti128 m2, m2, xm4, 1 pmaddwd m4, m2, [r5 + 1 * mmsize] paddd m0, m4 pmaddwd m2, [r5] lea r7, [r0 + r1 * 4] movu xm4, [r7] ; m4 = row 4 punpckhwd xm5, xm3, xm4 punpcklwd xm3, xm4 vinserti128 m3, m3, xm5, 1 pmaddwd m5, m3, [r5 + 1 * mmsize] paddd m1, m5 pmaddwd m3, [r5] movu xm5, [r7 + r1] ; m5 = row 5 punpckhwd xm6, xm4, xm5 punpcklwd xm4, xm5 vinserti128 m4, m4, xm6, 1 pmaddwd m6, m4, [r5 + 1 * mmsize] paddd m2, m6 pmaddwd m4, [r5] movu xm6, [r7 + r1 * 2] ; m6 = row 6 punpckhwd xm7, xm5, xm6 punpcklwd xm5, xm6 vinserti128 m5, m5, xm7, 1 pmaddwd m7, m5, [r5 + 1 * mmsize] paddd m3, m7 pmaddwd m5, [r5] %ifidn %1,sp paddd m0, m9 paddd m1, m9 paddd m2, m9 paddd m3, m9 psrad m0, 12 psrad m1, 12 psrad m2, 12 psrad m3, 12 %else psrad m0, 6 psrad m1, 6 psrad m2, 6 psrad m3, 6 %endif packssdw m0, m1 packssdw m2, m3 %ifidn %1,sp packuswb m0, m2 mova m3, [interp8_hps_shuf] vpermd m0, m3, m0 vextracti128 xm2, m0, 1 movq [r2], xm0 movhps [r2 + r3], xm0 movq [r2 + r3 * 2], xm2 movhps [r2 + r6], xm2 %else vpermq m0, m0, 11011000b vpermq m2, m2, 11011000b vextracti128 xm1, m0, 1 vextracti128 xm3, m2, 1 movu [r2], xm0 movu [r2 + r3], xm1 movu [r2 + r3 * 2], xm2 movu [r2 + r6], xm3 %endif movu xm7, [r7 + r4] ; m7 = row 7 punpckhwd xm8, xm6, xm7 punpcklwd xm6, xm7 vinserti128 m6, m6, xm8, 1 pmaddwd m8, m6, [r5 + 1 * mmsize] paddd m4, m8 pmaddwd m6, [r5] lea r7, [r7 + r1 * 4] movu xm8, [r7] ; m8 = row 8 punpckhwd xm0, xm7, xm8 punpcklwd xm7, xm8 vinserti128 m7, m7, xm0, 1 pmaddwd m0, m7, [r5 + 1 * mmsize] paddd m5, m0 pmaddwd m7, [r5] movu xm0, [r7 + r1] ; m0 = row 9 punpckhwd xm1, xm8, xm0 punpcklwd xm8, xm0 vinserti128 m8, m8, xm1, 1 pmaddwd m1, m8, [r5 + 1 * mmsize] paddd m6, m1 pmaddwd m8, [r5] movu xm1, [r7 + r1 * 2] ; m1 = row 10 punpckhwd xm2, xm0, xm1 punpcklwd xm0, xm1 vinserti128 m0, m0, xm2, 1 pmaddwd m2, m0, [r5 + 1 * mmsize] paddd m7, m2 pmaddwd m0, [r5] %ifidn %1,sp paddd m4, m9 paddd m5, m9 psrad m4, 12 psrad m5, 12 
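    ; sp path rounding: pd_526336 (kept in m9) = 64*8192 + 2048, so this
    ; paddd/psrad-12 pair both rounds away the two 6-bit filter scalings
    ; and cancels the -8192 offset carried by each 16-bit intermediate
    ; sample (the four vertical taps sum to 64), bringing the result back
    ; into unsigned 8-bit pixel range for the packuswb that follows.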
paddd m6, m9 paddd m7, m9 psrad m6, 12 psrad m7, 12 %else psrad m4, 6 psrad m5, 6 psrad m6, 6 psrad m7, 6 %endif packssdw m4, m5 packssdw m6, m7 lea r8, [r2 + r3 * 4] %ifidn %1,sp packuswb m4, m6 vpermd m4, m3, m4 vextracti128 xm6, m4, 1 movq [r8], xm4 movhps [r8 + r3], xm4 movq [r8 + r3 * 2], xm6 movhps [r8 + r6], xm6 %else vpermq m4, m4, 11011000b vpermq m6, m6, 11011000b vextracti128 xm5, m4, 1 vextracti128 xm7, m6, 1 movu [r8], xm4 movu [r8 + r3], xm5 movu [r8 + r3 * 2], xm6 movu [r8 + r6], xm7 %endif movu xm2, [r7 + r4] ; m2 = row 11 punpckhwd xm4, xm1, xm2 punpcklwd xm1, xm2 vinserti128 m1, m1, xm4, 1 pmaddwd m4, m1, [r5 + 1 * mmsize] paddd m8, m4 pmaddwd m1, [r5] lea r7, [r7 + r1 * 4] movu xm4, [r7] ; m4 = row 12 punpckhwd xm5, xm2, xm4 punpcklwd xm2, xm4 vinserti128 m2, m2, xm5, 1 pmaddwd m5, m2, [r5 + 1 * mmsize] paddd m0, m5 pmaddwd m2, [r5] movu xm5, [r7 + r1] ; m5 = row 13 punpckhwd xm6, xm4, xm5 punpcklwd xm4, xm5 vinserti128 m4, m4, xm6, 1 pmaddwd m6, m4, [r5 + 1 * mmsize] paddd m1, m6 pmaddwd m4, [r5] movu xm6, [r7 + r1 * 2] ; m6 = row 14 punpckhwd xm7, xm5, xm6 punpcklwd xm5, xm6 vinserti128 m5, m5, xm7, 1 pmaddwd m7, m5, [r5 + 1 * mmsize] paddd m2, m7 pmaddwd m5, [r5] %ifidn %1,sp paddd m8, m9 paddd m0, m9 paddd m1, m9 paddd m2, m9 psrad m8, 12 psrad m0, 12 psrad m1, 12 psrad m2, 12 %else psrad m8, 6 psrad m0, 6 psrad m1, 6 psrad m2, 6 %endif packssdw m8, m0 packssdw m1, m2 lea r8, [r8 + r3 * 4] %ifidn %1,sp packuswb m8, m1 vpermd m8, m3, m8 vextracti128 xm1, m8, 1 movq [r8], xm8 movhps [r8 + r3], xm8 movq [r8 + r3 * 2], xm1 movhps [r8 + r6], xm1 %else vpermq m8, m8, 11011000b vpermq m1, m1, 11011000b vextracti128 xm0, m8, 1 vextracti128 xm2, m1, 1 movu [r8], xm8 movu [r8 + r3], xm0 movu [r8 + r3 * 2], xm1 movu [r8 + r6], xm2 %endif lea r8, [r8 + r3 * 4] movu xm7, [r7 + r4] ; m7 = row 15 punpckhwd xm2, xm6, xm7 punpcklwd xm6, xm7 vinserti128 m6, m6, xm2, 1 pmaddwd m2, m6, [r5 + 1 * mmsize] paddd m4, m2 pmaddwd m6, [r5] lea r7, [r7 + r1 * 4] movu xm2, [r7] ; m2 = row 16 punpckhwd xm1, xm7, xm2 punpcklwd xm7, xm2 vinserti128 m7, m7, xm1, 1 pmaddwd m1, m7, [r5 + 1 * mmsize] paddd m5, m1 pmaddwd m7, [r5] movu xm1, [r7 + r1] ; m1 = row 17 punpckhwd xm0, xm2, xm1 punpcklwd xm2, xm1 vinserti128 m2, m2, xm0, 1 pmaddwd m2, [r5 + 1 * mmsize] paddd m6, m2 movu xm0, [r7 + r1 * 2] ; m0 = row 18 punpckhwd xm2, xm1, xm0 punpcklwd xm1, xm0 vinserti128 m1, m1, xm2, 1 pmaddwd m1, [r5 + 1 * mmsize] paddd m7, m1 %ifidn %1,sp paddd m4, m9 paddd m5, m9 paddd m6, m9 paddd m7, m9 psrad m4, 12 psrad m5, 12 psrad m6, 12 psrad m7, 12 %else psrad m4, 6 psrad m5, 6 psrad m6, 6 psrad m7, 6 %endif packssdw m4, m5 packssdw m6, m7 %ifidn %1,sp packuswb m4, m6 vpermd m4, m3, m4 vextracti128 xm6, m4, 1 movq [r8], xm4 movhps [r8 + r3], xm4 movq [r8 + r3 * 2], xm6 movhps [r8 + r6], xm6 %else vpermq m4, m4, 11011000b vpermq m6, m6, 11011000b vextracti128 xm5, m4, 1 vextracti128 xm7, m6, 1 movu [r8], xm4 movu [r8 + r3], xm5 movu [r8 + r3 * 2], xm6 movu [r8 + r6], xm7 %endif %endmacro %macro FILTER_VER_CHROMA_S_AVX2_Nx16 2 INIT_YMM avx2 %if ARCH_X86_64 == 1 cglobal interp_4tap_vert_%1_%2x16, 4, 10, 10 mov r4d, r4m shl r4d, 6 add r1d, r1d %ifdef PIC lea r5, [pw_ChromaCoeffV] add r5, r4 %else lea r5, [pw_ChromaCoeffV + r4] %endif lea r4, [r1 * 3] sub r0, r1 %ifidn %1,sp mova m9, [pd_526336] %else add r3d, r3d %endif lea r6, [r3 * 3] mov r9d, %2 / 8 .loopW: PROCESS_CHROMA_S_AVX2_W8_16R %1 %ifidn %1,sp add r2, 8 %else add r2, 16 %endif add r0, 16 dec r9d jnz .loopW RET %endif %endmacro FILTER_VER_CHROMA_S_AVX2_Nx16 
sp, 16 FILTER_VER_CHROMA_S_AVX2_Nx16 sp, 32 FILTER_VER_CHROMA_S_AVX2_Nx16 sp, 64 FILTER_VER_CHROMA_S_AVX2_Nx16 ss, 16 FILTER_VER_CHROMA_S_AVX2_Nx16 ss, 32 FILTER_VER_CHROMA_S_AVX2_Nx16 ss, 64 %macro FILTER_VER_CHROMA_S_AVX2_NxN 3 INIT_YMM avx2 %if ARCH_X86_64 == 1 cglobal interp_4tap_vert_%3_%1x%2, 4, 11, 10 mov r4d, r4m shl r4d, 6 add r1d, r1d %ifdef PIC lea r5, [pw_ChromaCoeffV] add r5, r4 %else lea r5, [pw_ChromaCoeffV + r4] %endif lea r4, [r1 * 3] sub r0, r1 %ifidn %3,sp mova m9, [pd_526336] %else add r3d, r3d %endif lea r6, [r3 * 3] mov r9d, %2 / 16 .loopH: mov r10d, %1 / 8 .loopW: PROCESS_CHROMA_S_AVX2_W8_16R %3 %ifidn %3,sp add r2, 8 %else add r2, 16 %endif add r0, 16 dec r10d jnz .loopW lea r0, [r7 - 2 * %1 + 16] %ifidn %3,sp lea r2, [r8 + r3 * 4 - %1 + 8] %else lea r2, [r8 + r3 * 4 - 2 * %1 + 16] %endif dec r9d jnz .loopH RET %endif %endmacro FILTER_VER_CHROMA_S_AVX2_NxN 16, 32, sp FILTER_VER_CHROMA_S_AVX2_NxN 24, 32, sp FILTER_VER_CHROMA_S_AVX2_NxN 32, 32, sp FILTER_VER_CHROMA_S_AVX2_NxN 16, 32, ss FILTER_VER_CHROMA_S_AVX2_NxN 24, 32, ss FILTER_VER_CHROMA_S_AVX2_NxN 32, 32, ss FILTER_VER_CHROMA_S_AVX2_NxN 16, 64, sp FILTER_VER_CHROMA_S_AVX2_NxN 24, 64, sp FILTER_VER_CHROMA_S_AVX2_NxN 32, 64, sp FILTER_VER_CHROMA_S_AVX2_NxN 32, 48, sp FILTER_VER_CHROMA_S_AVX2_NxN 32, 48, ss FILTER_VER_CHROMA_S_AVX2_NxN 16, 64, ss FILTER_VER_CHROMA_S_AVX2_NxN 24, 64, ss FILTER_VER_CHROMA_S_AVX2_NxN 32, 64, ss FILTER_VER_CHROMA_S_AVX2_NxN 64, 64, sp FILTER_VER_CHROMA_S_AVX2_NxN 64, 32, sp FILTER_VER_CHROMA_S_AVX2_NxN 64, 48, sp FILTER_VER_CHROMA_S_AVX2_NxN 48, 64, sp FILTER_VER_CHROMA_S_AVX2_NxN 64, 64, ss FILTER_VER_CHROMA_S_AVX2_NxN 64, 32, ss FILTER_VER_CHROMA_S_AVX2_NxN 64, 48, ss FILTER_VER_CHROMA_S_AVX2_NxN 48, 64, ss %macro PROCESS_CHROMA_S_AVX2_W8_4R 1 movu xm0, [r0] ; m0 = row 0 movu xm1, [r0 + r1] ; m1 = row 1 punpckhwd xm2, xm0, xm1 punpcklwd xm0, xm1 vinserti128 m0, m0, xm2, 1 pmaddwd m0, [r5] movu xm2, [r0 + r1 * 2] ; m2 = row 2 punpckhwd xm3, xm1, xm2 punpcklwd xm1, xm2 vinserti128 m1, m1, xm3, 1 pmaddwd m1, [r5] movu xm3, [r0 + r4] ; m3 = row 3 punpckhwd xm4, xm2, xm3 punpcklwd xm2, xm3 vinserti128 m2, m2, xm4, 1 pmaddwd m4, m2, [r5 + 1 * mmsize] paddd m0, m4 pmaddwd m2, [r5] lea r0, [r0 + r1 * 4] movu xm4, [r0] ; m4 = row 4 punpckhwd xm5, xm3, xm4 punpcklwd xm3, xm4 vinserti128 m3, m3, xm5, 1 pmaddwd m5, m3, [r5 + 1 * mmsize] paddd m1, m5 pmaddwd m3, [r5] movu xm5, [r0 + r1] ; m5 = row 5 punpckhwd xm6, xm4, xm5 punpcklwd xm4, xm5 vinserti128 m4, m4, xm6, 1 pmaddwd m4, [r5 + 1 * mmsize] paddd m2, m4 movu xm6, [r0 + r1 * 2] ; m6 = row 6 punpckhwd xm4, xm5, xm6 punpcklwd xm5, xm6 vinserti128 m5, m5, xm4, 1 pmaddwd m5, [r5 + 1 * mmsize] paddd m3, m5 %ifidn %1,sp paddd m0, m7 paddd m1, m7 paddd m2, m7 paddd m3, m7 psrad m0, 12 psrad m1, 12 psrad m2, 12 psrad m3, 12 %else psrad m0, 6 psrad m1, 6 psrad m2, 6 psrad m3, 6 %endif packssdw m0, m1 packssdw m2, m3 %ifidn %1,sp packuswb m0, m2 mova m3, [interp8_hps_shuf] vpermd m0, m3, m0 vextracti128 xm2, m0, 1 %else vpermq m0, m0, 11011000b vpermq m2, m2, 11011000b vextracti128 xm1, m0, 1 vextracti128 xm3, m2, 1 %endif %endmacro %macro FILTER_VER_CHROMA_S_AVX2_8x4 1 INIT_YMM avx2 cglobal interp_4tap_vert_%1_8x4, 4, 6, 8 mov r4d, r4m shl r4d, 6 add r1d, r1d %ifdef PIC lea r5, [pw_ChromaCoeffV] add r5, r4 %else lea r5, [pw_ChromaCoeffV + r4] %endif lea r4, [r1 * 3] sub r0, r1 %ifidn %1,sp mova m7, [pd_526336] %else add r3d, r3d %endif PROCESS_CHROMA_S_AVX2_W8_4R %1 lea r4, [r3 * 3] %ifidn %1,sp movq [r2], xm0 movhps [r2 + r3], xm0 movq [r2 + r3 * 
2], xm2 movhps [r2 + r4], xm2 %else movu [r2], xm0 movu [r2 + r3], xm1 movu [r2 + r3 * 2], xm2 movu [r2 + r4], xm3 %endif RET %endmacro FILTER_VER_CHROMA_S_AVX2_8x4 sp FILTER_VER_CHROMA_S_AVX2_8x4 ss %macro FILTER_VER_CHROMA_S_AVX2_12x16 1 INIT_YMM avx2 %if ARCH_X86_64 == 1 cglobal interp_4tap_vert_%1_12x16, 4, 9, 10 mov r4d, r4m shl r4d, 6 add r1d, r1d %ifdef PIC lea r5, [pw_ChromaCoeffV] add r5, r4 %else lea r5, [pw_ChromaCoeffV + r4] %endif lea r4, [r1 * 3] sub r0, r1 %ifidn %1,sp mova m9, [pd_526336] %else add r3d, r3d %endif lea r6, [r3 * 3] PROCESS_CHROMA_S_AVX2_W8_16R %1 %ifidn %1,sp add r2, 8 %else add r2, 16 %endif add r0, 16 mova m7, m9 PROCESS_CHROMA_AVX2_W4_16R %1 RET %endif %endmacro FILTER_VER_CHROMA_S_AVX2_12x16 sp FILTER_VER_CHROMA_S_AVX2_12x16 ss %macro FILTER_VER_CHROMA_S_AVX2_12x32 1 %if ARCH_X86_64 == 1 INIT_YMM avx2 cglobal interp_4tap_vert_%1_12x32, 4, 9, 10 mov r4d, r4m shl r4d, 6 add r1d, r1d %ifdef PIC lea r5, [pw_ChromaCoeffV] add r5, r4 %else lea r5, [pw_ChromaCoeffV + r4] %endif lea r4, [r1 * 3] sub r0, r1 %ifidn %1, sp mova m9, [pd_526336] %else add r3d, r3d %endif lea r6, [r3 * 3] %rep 2 PROCESS_CHROMA_S_AVX2_W8_16R %1 %ifidn %1, sp add r2, 8 %else add r2, 16 %endif add r0, 16 mova m7, m9 PROCESS_CHROMA_AVX2_W4_16R %1 sub r0, 16 %ifidn %1, sp lea r2, [r2 + r3 * 4 - 8] %else lea r2, [r2 + r3 * 4 - 16] %endif %endrep RET %endif %endmacro FILTER_VER_CHROMA_S_AVX2_12x32 sp FILTER_VER_CHROMA_S_AVX2_12x32 ss %macro FILTER_VER_CHROMA_S_AVX2_16x12 1 INIT_YMM avx2 %if ARCH_X86_64 == 1 cglobal interp_4tap_vert_%1_16x12, 4, 9, 9 mov r4d, r4m shl r4d, 6 add r1d, r1d %ifdef PIC lea r5, [pw_ChromaCoeffV] add r5, r4 %else lea r5, [pw_ChromaCoeffV + r4] %endif lea r4, [r1 * 3] sub r0, r1 %ifidn %1,sp mova m8, [pd_526336] %else add r3d, r3d %endif lea r6, [r3 * 3] %rep 2 movu xm0, [r0] ; m0 = row 0 movu xm1, [r0 + r1] ; m1 = row 1 punpckhwd xm2, xm0, xm1 punpcklwd xm0, xm1 vinserti128 m0, m0, xm2, 1 pmaddwd m0, [r5] movu xm2, [r0 + r1 * 2] ; m2 = row 2 punpckhwd xm3, xm1, xm2 punpcklwd xm1, xm2 vinserti128 m1, m1, xm3, 1 pmaddwd m1, [r5] movu xm3, [r0 + r4] ; m3 = row 3 punpckhwd xm4, xm2, xm3 punpcklwd xm2, xm3 vinserti128 m2, m2, xm4, 1 pmaddwd m4, m2, [r5 + 1 * mmsize] paddd m0, m4 pmaddwd m2, [r5] lea r7, [r0 + r1 * 4] movu xm4, [r7] ; m4 = row 4 punpckhwd xm5, xm3, xm4 punpcklwd xm3, xm4 vinserti128 m3, m3, xm5, 1 pmaddwd m5, m3, [r5 + 1 * mmsize] paddd m1, m5 pmaddwd m3, [r5] %ifidn %1,sp paddd m0, m8 paddd m1, m8 psrad m0, 12 psrad m1, 12 %else psrad m0, 6 psrad m1, 6 %endif packssdw m0, m1 movu xm5, [r7 + r1] ; m5 = row 5 punpckhwd xm6, xm4, xm5 punpcklwd xm4, xm5 vinserti128 m4, m4, xm6, 1 pmaddwd m6, m4, [r5 + 1 * mmsize] paddd m2, m6 pmaddwd m4, [r5] movu xm6, [r7 + r1 * 2] ; m6 = row 6 punpckhwd xm1, xm5, xm6 punpcklwd xm5, xm6 vinserti128 m5, m5, xm1, 1 pmaddwd m1, m5, [r5 + 1 * mmsize] pmaddwd m5, [r5] paddd m3, m1 %ifidn %1,sp paddd m2, m8 paddd m3, m8 psrad m2, 12 psrad m3, 12 %else psrad m2, 6 psrad m3, 6 %endif packssdw m2, m3 %ifidn %1,sp packuswb m0, m2 mova m3, [interp8_hps_shuf] vpermd m0, m3, m0 vextracti128 xm2, m0, 1 movq [r2], xm0 movhps [r2 + r3], xm0 movq [r2 + r3 * 2], xm2 movhps [r2 + r6], xm2 %else vpermq m0, m0, 11011000b vpermq m2, m2, 11011000b movu [r2], xm0 vextracti128 xm0, m0, 1 vextracti128 xm3, m2, 1 movu [r2 + r3], xm0 movu [r2 + r3 * 2], xm2 movu [r2 + r6], xm3 %endif lea r8, [r2 + r3 * 4] movu xm1, [r7 + r4] ; m1 = row 7 punpckhwd xm0, xm6, xm1 punpcklwd xm6, xm1 vinserti128 m6, m6, xm0, 1 pmaddwd m0, m6, [r5 + 1 * mmsize] pmaddwd m6, 
[r5] paddd m4, m0 lea r7, [r7 + r1 * 4] movu xm0, [r7] ; m0 = row 8 punpckhwd xm2, xm1, xm0 punpcklwd xm1, xm0 vinserti128 m1, m1, xm2, 1 pmaddwd m2, m1, [r5 + 1 * mmsize] pmaddwd m1, [r5] paddd m5, m2 %ifidn %1,sp paddd m4, m8 paddd m5, m8 psrad m4, 12 psrad m5, 12 %else psrad m4, 6 psrad m5, 6 %endif packssdw m4, m5 movu xm2, [r7 + r1] ; m2 = row 9 punpckhwd xm5, xm0, xm2 punpcklwd xm0, xm2 vinserti128 m0, m0, xm5, 1 pmaddwd m5, m0, [r5 + 1 * mmsize] paddd m6, m5 pmaddwd m0, [r5] movu xm5, [r7 + r1 * 2] ; m5 = row 10 punpckhwd xm7, xm2, xm5 punpcklwd xm2, xm5 vinserti128 m2, m2, xm7, 1 pmaddwd m7, m2, [r5 + 1 * mmsize] paddd m1, m7 pmaddwd m2, [r5] %ifidn %1,sp paddd m6, m8 paddd m1, m8 psrad m6, 12 psrad m1, 12 %else psrad m6, 6 psrad m1, 6 %endif packssdw m6, m1 %ifidn %1,sp packuswb m4, m6 vpermd m4, m3, m4 vextracti128 xm6, m4, 1 movq [r8], xm4 movhps [r8 + r3], xm4 movq [r8 + r3 * 2], xm6 movhps [r8 + r6], xm6 %else vpermq m4, m4, 11011000b vpermq m6, m6, 11011000b vextracti128 xm7, m4, 1 vextracti128 xm1, m6, 1 movu [r8], xm4 movu [r8 + r3], xm7 movu [r8 + r3 * 2], xm6 movu [r8 + r6], xm1 %endif lea r8, [r8 + r3 * 4] movu xm7, [r7 + r4] ; m7 = row 11 punpckhwd xm1, xm5, xm7 punpcklwd xm5, xm7 vinserti128 m5, m5, xm1, 1 pmaddwd m1, m5, [r5 + 1 * mmsize] paddd m0, m1 pmaddwd m5, [r5] lea r7, [r7 + r1 * 4] movu xm1, [r7] ; m1 = row 12 punpckhwd xm4, xm7, xm1 punpcklwd xm7, xm1 vinserti128 m7, m7, xm4, 1 pmaddwd m4, m7, [r5 + 1 * mmsize] paddd m2, m4 pmaddwd m7, [r5] %ifidn %1,sp paddd m0, m8 paddd m2, m8 psrad m0, 12 psrad m2, 12 %else psrad m0, 6 psrad m2, 6 %endif packssdw m0, m2 movu xm4, [r7 + r1] ; m4 = row 13 punpckhwd xm2, xm1, xm4 punpcklwd xm1, xm4 vinserti128 m1, m1, xm2, 1 pmaddwd m1, [r5 + 1 * mmsize] paddd m5, m1 movu xm2, [r7 + r1 * 2] ; m2 = row 14 punpckhwd xm6, xm4, xm2 punpcklwd xm4, xm2 vinserti128 m4, m4, xm6, 1 pmaddwd m4, [r5 + 1 * mmsize] paddd m7, m4 %ifidn %1,sp paddd m5, m8 paddd m7, m8 psrad m5, 12 psrad m7, 12 %else psrad m5, 6 psrad m7, 6 %endif packssdw m5, m7 %ifidn %1,sp packuswb m0, m5 vpermd m0, m3, m0 vextracti128 xm5, m0, 1 movq [r8], xm0 movhps [r8 + r3], xm0 movq [r8 + r3 * 2], xm5 movhps [r8 + r6], xm5 add r2, 8 %else vpermq m0, m0, 11011000b vpermq m5, m5, 11011000b vextracti128 xm7, m0, 1 vextracti128 xm6, m5, 1 movu [r8], xm0 movu [r8 + r3], xm7 movu [r8 + r3 * 2], xm5 movu [r8 + r6], xm6 add r2, 16 %endif add r0, 16 %endrep RET %endif %endmacro FILTER_VER_CHROMA_S_AVX2_16x12 sp FILTER_VER_CHROMA_S_AVX2_16x12 ss %macro FILTER_VER_CHROMA_S_AVX2_8x12 1 %if ARCH_X86_64 == 1 INIT_YMM avx2 cglobal interp_4tap_vert_%1_8x12, 4, 7, 9 mov r4d, r4m shl r4d, 6 add r1d, r1d %ifdef PIC lea r5, [pw_ChromaCoeffV] add r5, r4 %else lea r5, [pw_ChromaCoeffV + r4] %endif lea r4, [r1 * 3] sub r0, r1 %ifidn %1,sp mova m8, [pd_526336] %else add r3d, r3d %endif lea r6, [r3 * 3] movu xm0, [r0] ; m0 = row 0 movu xm1, [r0 + r1] ; m1 = row 1 punpckhwd xm2, xm0, xm1 punpcklwd xm0, xm1 vinserti128 m0, m0, xm2, 1 pmaddwd m0, [r5] movu xm2, [r0 + r1 * 2] ; m2 = row 2 punpckhwd xm3, xm1, xm2 punpcklwd xm1, xm2 vinserti128 m1, m1, xm3, 1 pmaddwd m1, [r5] movu xm3, [r0 + r4] ; m3 = row 3 punpckhwd xm4, xm2, xm3 punpcklwd xm2, xm3 vinserti128 m2, m2, xm4, 1 pmaddwd m4, m2, [r5 + 1 * mmsize] paddd m0, m4 pmaddwd m2, [r5] lea r0, [r0 + r1 * 4] movu xm4, [r0] ; m4 = row 4 punpckhwd xm5, xm3, xm4 punpcklwd xm3, xm4 vinserti128 m3, m3, xm5, 1 pmaddwd m5, m3, [r5 + 1 * mmsize] paddd m1, m5 pmaddwd m3, [r5] %ifidn %1,sp paddd m0, m8 paddd m1, m8 psrad m0, 12 psrad m1, 12 %else psrad 
m0, 6 psrad m1, 6 %endif packssdw m0, m1 movu xm5, [r0 + r1] ; m5 = row 5 punpckhwd xm6, xm4, xm5 punpcklwd xm4, xm5 vinserti128 m4, m4, xm6, 1 pmaddwd m6, m4, [r5 + 1 * mmsize] paddd m2, m6 pmaddwd m4, [r5] movu xm6, [r0 + r1 * 2] ; m6 = row 6 punpckhwd xm1, xm5, xm6 punpcklwd xm5, xm6 vinserti128 m5, m5, xm1, 1 pmaddwd m1, m5, [r5 + 1 * mmsize] pmaddwd m5, [r5] paddd m3, m1 %ifidn %1,sp paddd m2, m8 paddd m3, m8 psrad m2, 12 psrad m3, 12 %else psrad m2, 6 psrad m3, 6 %endif packssdw m2, m3 %ifidn %1,sp packuswb m0, m2 mova m3, [interp8_hps_shuf] vpermd m0, m3, m0 vextracti128 xm2, m0, 1 movq [r2], xm0 movhps [r2 + r3], xm0 movq [r2 + r3 * 2], xm2 movhps [r2 + r6], xm2 %else vpermq m0, m0, 11011000b vpermq m2, m2, 11011000b movu [r2], xm0 vextracti128 xm0, m0, 1 vextracti128 xm3, m2, 1 movu [r2 + r3], xm0 movu [r2 + r3 * 2], xm2 movu [r2 + r6], xm3 %endif lea r2, [r2 + r3 * 4] movu xm1, [r0 + r4] ; m1 = row 7 punpckhwd xm0, xm6, xm1 punpcklwd xm6, xm1 vinserti128 m6, m6, xm0, 1 pmaddwd m0, m6, [r5 + 1 * mmsize] pmaddwd m6, [r5] paddd m4, m0 lea r0, [r0 + r1 * 4] movu xm0, [r0] ; m0 = row 8 punpckhwd xm2, xm1, xm0 punpcklwd xm1, xm0 vinserti128 m1, m1, xm2, 1 pmaddwd m2, m1, [r5 + 1 * mmsize] pmaddwd m1, [r5] paddd m5, m2 %ifidn %1,sp paddd m4, m8 paddd m5, m8 psrad m4, 12 psrad m5, 12 %else psrad m4, 6 psrad m5, 6 %endif packssdw m4, m5 movu xm2, [r0 + r1] ; m2 = row 9 punpckhwd xm5, xm0, xm2 punpcklwd xm0, xm2 vinserti128 m0, m0, xm5, 1 pmaddwd m5, m0, [r5 + 1 * mmsize] paddd m6, m5 pmaddwd m0, [r5] movu xm5, [r0 + r1 * 2] ; m5 = row 10 punpckhwd xm7, xm2, xm5 punpcklwd xm2, xm5 vinserti128 m2, m2, xm7, 1 pmaddwd m7, m2, [r5 + 1 * mmsize] paddd m1, m7 pmaddwd m2, [r5] %ifidn %1,sp paddd m6, m8 paddd m1, m8 psrad m6, 12 psrad m1, 12 %else psrad m6, 6 psrad m1, 6 %endif packssdw m6, m1 %ifidn %1,sp packuswb m4, m6 vpermd m4, m3, m4 vextracti128 xm6, m4, 1 movq [r2], xm4 movhps [r2 + r3], xm4 movq [r2 + r3 * 2], xm6 movhps [r2 + r6], xm6 %else vpermq m4, m4, 11011000b vpermq m6, m6, 11011000b vextracti128 xm7, m4, 1 vextracti128 xm1, m6, 1 movu [r2], xm4 movu [r2 + r3], xm7 movu [r2 + r3 * 2], xm6 movu [r2 + r6], xm1 %endif lea r2, [r2 + r3 * 4] movu xm7, [r0 + r4] ; m7 = row 11 punpckhwd xm1, xm5, xm7 punpcklwd xm5, xm7 vinserti128 m5, m5, xm1, 1 pmaddwd m1, m5, [r5 + 1 * mmsize] paddd m0, m1 pmaddwd m5, [r5] lea r0, [r0 + r1 * 4] movu xm1, [r0] ; m1 = row 12 punpckhwd xm4, xm7, xm1 punpcklwd xm7, xm1 vinserti128 m7, m7, xm4, 1 pmaddwd m4, m7, [r5 + 1 * mmsize] paddd m2, m4 pmaddwd m7, [r5] %ifidn %1,sp paddd m0, m8 paddd m2, m8 psrad m0, 12 psrad m2, 12 %else psrad m0, 6 psrad m2, 6 %endif packssdw m0, m2 movu xm4, [r0 + r1] ; m4 = row 13 punpckhwd xm2, xm1, xm4 punpcklwd xm1, xm4 vinserti128 m1, m1, xm2, 1 pmaddwd m1, [r5 + 1 * mmsize] paddd m5, m1 movu xm2, [r0 + r1 * 2] ; m2 = row 14 punpckhwd xm6, xm4, xm2 punpcklwd xm4, xm2 vinserti128 m4, m4, xm6, 1 pmaddwd m4, [r5 + 1 * mmsize] paddd m7, m4 %ifidn %1,sp paddd m5, m8 paddd m7, m8 psrad m5, 12 psrad m7, 12 %else psrad m5, 6 psrad m7, 6 %endif packssdw m5, m7 %ifidn %1,sp packuswb m0, m5 vpermd m0, m3, m0 vextracti128 xm5, m0, 1 movq [r2], xm0 movhps [r2 + r3], xm0 movq [r2 + r3 * 2], xm5 movhps [r2 + r6], xm5 %else vpermq m0, m0, 11011000b vpermq m5, m5, 11011000b vextracti128 xm7, m0, 1 vextracti128 xm6, m5, 1 movu [r2], xm0 movu [r2 + r3], xm7 movu [r2 + r3 * 2], xm5 movu [r2 + r6], xm6 %endif RET %endif %endmacro FILTER_VER_CHROMA_S_AVX2_8x12 sp FILTER_VER_CHROMA_S_AVX2_8x12 ss %macro FILTER_VER_CHROMA_S_AVX2_16x4 1 INIT_YMM avx2 
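; 16x4: the %rep 2 below runs the 8-wide, 4-row core twice, stepping the
; source by 16 bytes (8 int16_t lanes) and the destination by 8 bytes
; (sp, packed pixels) or 16 bytes (ss) between the two column halves.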
cglobal interp_4tap_vert_%1_16x4, 4, 7, 8 mov r4d, r4m shl r4d, 6 add r1d, r1d %ifdef PIC lea r5, [pw_ChromaCoeffV] add r5, r4 %else lea r5, [pw_ChromaCoeffV + r4] %endif lea r4, [r1 * 3] sub r0, r1 %ifidn %1,sp mova m7, [pd_526336] %else add r3d, r3d %endif %rep 2 PROCESS_CHROMA_S_AVX2_W8_4R %1 lea r6, [r3 * 3] %ifidn %1,sp movq [r2], xm0 movhps [r2 + r3], xm0 movq [r2 + r3 * 2], xm2 movhps [r2 + r6], xm2 add r2, 8 %else movu [r2], xm0 movu [r2 + r3], xm1 movu [r2 + r3 * 2], xm2 movu [r2 + r6], xm3 add r2, 16 %endif lea r6, [4 * r1 - 16] sub r0, r6 %endrep RET %endmacro FILTER_VER_CHROMA_S_AVX2_16x4 sp FILTER_VER_CHROMA_S_AVX2_16x4 ss %macro PROCESS_CHROMA_S_AVX2_W8_8R 1 movu xm0, [r0] ; m0 = row 0 movu xm1, [r0 + r1] ; m1 = row 1 punpckhwd xm2, xm0, xm1 punpcklwd xm0, xm1 vinserti128 m0, m0, xm2, 1 pmaddwd m0, [r5] movu xm2, [r0 + r1 * 2] ; m2 = row 2 punpckhwd xm3, xm1, xm2 punpcklwd xm1, xm2 vinserti128 m1, m1, xm3, 1 pmaddwd m1, [r5] movu xm3, [r0 + r4] ; m3 = row 3 punpckhwd xm4, xm2, xm3 punpcklwd xm2, xm3 vinserti128 m2, m2, xm4, 1 pmaddwd m4, m2, [r5 + 1 * mmsize] paddd m0, m4 pmaddwd m2, [r5] lea r7, [r0 + r1 * 4] movu xm4, [r7] ; m4 = row 4 punpckhwd xm5, xm3, xm4 punpcklwd xm3, xm4 vinserti128 m3, m3, xm5, 1 pmaddwd m5, m3, [r5 + 1 * mmsize] paddd m1, m5 pmaddwd m3, [r5] %ifidn %1,sp paddd m0, m7 paddd m1, m7 psrad m0, 12 psrad m1, 12 %else psrad m0, 6 psrad m1, 6 %endif packssdw m0, m1 movu xm5, [r7 + r1] ; m5 = row 5 punpckhwd xm6, xm4, xm5 punpcklwd xm4, xm5 vinserti128 m4, m4, xm6, 1 pmaddwd m6, m4, [r5 + 1 * mmsize] paddd m2, m6 pmaddwd m4, [r5] movu xm6, [r7 + r1 * 2] ; m6 = row 6 punpckhwd xm1, xm5, xm6 punpcklwd xm5, xm6 vinserti128 m5, m5, xm1, 1 pmaddwd m1, m5, [r5 + 1 * mmsize] pmaddwd m5, [r5] paddd m3, m1 %ifidn %1,sp paddd m2, m7 paddd m3, m7 psrad m2, 12 psrad m3, 12 %else psrad m2, 6 psrad m3, 6 %endif packssdw m2, m3 %ifidn %1,sp packuswb m0, m2 mova m3, [interp8_hps_shuf] vpermd m0, m3, m0 vextracti128 xm2, m0, 1 movq [r2], xm0 movhps [r2 + r3], xm0 movq [r2 + r3 * 2], xm2 movhps [r2 + r6], xm2 %else vpermq m0, m0, 11011000b vpermq m2, m2, 11011000b movu [r2], xm0 vextracti128 xm0, m0, 1 vextracti128 xm3, m2, 1 movu [r2 + r3], xm0 movu [r2 + r3 * 2], xm2 movu [r2 + r6], xm3 %endif lea r8, [r2 + r3 * 4] movu xm1, [r7 + r4] ; m1 = row 7 punpckhwd xm0, xm6, xm1 punpcklwd xm6, xm1 vinserti128 m6, m6, xm0, 1 pmaddwd m0, m6, [r5 + 1 * mmsize] pmaddwd m6, [r5] paddd m4, m0 lea r7, [r7 + r1 * 4] movu xm0, [r7] ; m0 = row 8 punpckhwd xm2, xm1, xm0 punpcklwd xm1, xm0 vinserti128 m1, m1, xm2, 1 pmaddwd m2, m1, [r5 + 1 * mmsize] pmaddwd m1, [r5] paddd m5, m2 %ifidn %1,sp paddd m4, m7 paddd m5, m7 psrad m4, 12 psrad m5, 12 %else psrad m4, 6 psrad m5, 6 %endif packssdw m4, m5 movu xm2, [r7 + r1] ; m2 = row 9 punpckhwd xm5, xm0, xm2 punpcklwd xm0, xm2 vinserti128 m0, m0, xm5, 1 pmaddwd m0, [r5 + 1 * mmsize] paddd m6, m0 movu xm5, [r7 + r1 * 2] ; m5 = row 10 punpckhwd xm0, xm2, xm5 punpcklwd xm2, xm5 vinserti128 m2, m2, xm0, 1 pmaddwd m2, [r5 + 1 * mmsize] paddd m1, m2 %ifidn %1,sp paddd m6, m7 paddd m1, m7 psrad m6, 12 psrad m1, 12 %else psrad m6, 6 psrad m1, 6 %endif packssdw m6, m1 %ifidn %1,sp packuswb m4, m6 vpermd m4, m3, m4 vextracti128 xm6, m4, 1 movq [r8], xm4 movhps [r8 + r3], xm4 movq [r8 + r3 * 2], xm6 movhps [r8 + r6], xm6 %else vpermq m4, m4, 11011000b vpermq m6, m6, 11011000b vextracti128 xm7, m4, 1 vextracti128 xm1, m6, 1 movu [r8], xm4 movu [r8 + r3], xm7 movu [r8 + r3 * 2], xm6 movu [r8 + r6], xm1 %endif %endmacro %macro FILTER_VER_CHROMA_S_AVX2_Nx8 2 
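; %1 = sp/ss variant, %2 = block width; the %rep below unrolls %2/8
; independent 8-wide column strips through PROCESS_CHROMA_S_AVX2_W8_8R.
; x86-64 only: that shared core needs r7/r8 as extra src/dst row pointers.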
INIT_YMM avx2 %if ARCH_X86_64 == 1 cglobal interp_4tap_vert_%1_%2x8, 4, 9, 8 mov r4d, r4m shl r4d, 6 add r1d, r1d %ifdef PIC lea r5, [pw_ChromaCoeffV] add r5, r4 %else lea r5, [pw_ChromaCoeffV + r4] %endif lea r4, [r1 * 3] sub r0, r1 %ifidn %1,sp mova m7, [pd_526336] %else add r3d, r3d %endif lea r6, [r3 * 3] %rep %2 / 8 PROCESS_CHROMA_S_AVX2_W8_8R %1 %ifidn %1,sp add r2, 8 %else add r2, 16 %endif add r0, 16 %endrep RET %endif %endmacro FILTER_VER_CHROMA_S_AVX2_Nx8 sp, 32 FILTER_VER_CHROMA_S_AVX2_Nx8 sp, 16 FILTER_VER_CHROMA_S_AVX2_Nx8 ss, 32 FILTER_VER_CHROMA_S_AVX2_Nx8 ss, 16 %macro FILTER_VER_CHROMA_S_AVX2_8x2 1 INIT_YMM avx2 cglobal interp_4tap_vert_%1_8x2, 4, 6, 6 mov r4d, r4m shl r4d, 6 add r1d, r1d %ifdef PIC lea r5, [pw_ChromaCoeffV] add r5, r4 %else lea r5, [pw_ChromaCoeffV + r4] %endif lea r4, [r1 * 3] sub r0, r1 %ifidn %1,sp mova m5, [pd_526336] %else add r3d, r3d %endif movu xm0, [r0] ; m0 = row 0 movu xm1, [r0 + r1] ; m1 = row 1 punpckhwd xm2, xm0, xm1 punpcklwd xm0, xm1 vinserti128 m0, m0, xm2, 1 pmaddwd m0, [r5] movu xm2, [r0 + r1 * 2] ; m2 = row 2 punpckhwd xm3, xm1, xm2 punpcklwd xm1, xm2 vinserti128 m1, m1, xm3, 1 pmaddwd m1, [r5] movu xm3, [r0 + r4] ; m3 = row 3 punpckhwd xm4, xm2, xm3 punpcklwd xm2, xm3 vinserti128 m2, m2, xm4, 1 pmaddwd m2, [r5 + 1 * mmsize] paddd m0, m2 movu xm4, [r0 + r1 * 4] ; m4 = row 4 punpckhwd xm2, xm3, xm4 punpcklwd xm3, xm4 vinserti128 m3, m3, xm2, 1 pmaddwd m3, [r5 + 1 * mmsize] paddd m1, m3 %ifidn %1,sp paddd m0, m5 paddd m1, m5 psrad m0, 12 psrad m1, 12 %else psrad m0, 6 psrad m1, 6 %endif packssdw m0, m1 %ifidn %1,sp vextracti128 xm1, m0, 1 packuswb xm0, xm1 pshufd xm0, xm0, 11011000b movq [r2], xm0 movhps [r2 + r3], xm0 %else vpermq m0, m0, 11011000b vextracti128 xm1, m0, 1 movu [r2], xm0 movu [r2 + r3], xm1 %endif RET %endmacro FILTER_VER_CHROMA_S_AVX2_8x2 sp FILTER_VER_CHROMA_S_AVX2_8x2 ss %macro FILTER_VER_CHROMA_S_AVX2_8x6 1 INIT_YMM avx2 cglobal interp_4tap_vert_%1_8x6, 4, 6, 8 mov r4d, r4m shl r4d, 6 add r1d, r1d %ifdef PIC lea r5, [pw_ChromaCoeffV] add r5, r4 %else lea r5, [pw_ChromaCoeffV + r4] %endif lea r4, [r1 * 3] sub r0, r1 %ifidn %1,sp mova m7, [pd_526336] %else add r3d, r3d %endif movu xm0, [r0] ; m0 = row 0 movu xm1, [r0 + r1] ; m1 = row 1 punpckhwd xm2, xm0, xm1 punpcklwd xm0, xm1 vinserti128 m0, m0, xm2, 1 pmaddwd m0, [r5] movu xm2, [r0 + r1 * 2] ; m2 = row 2 punpckhwd xm3, xm1, xm2 punpcklwd xm1, xm2 vinserti128 m1, m1, xm3, 1 pmaddwd m1, [r5] movu xm3, [r0 + r4] ; m3 = row 3 punpckhwd xm4, xm2, xm3 punpcklwd xm2, xm3 vinserti128 m2, m2, xm4, 1 pmaddwd m4, m2, [r5 + 1 * mmsize] pmaddwd m2, [r5] paddd m0, m4 lea r0, [r0 + r1 * 4] movu xm4, [r0] ; m4 = row 4 punpckhwd xm5, xm3, xm4 punpcklwd xm3, xm4 vinserti128 m3, m3, xm5, 1 pmaddwd m5, m3, [r5 + 1 * mmsize] pmaddwd m3, [r5] paddd m1, m5 %ifidn %1,sp paddd m0, m7 paddd m1, m7 psrad m0, 12 psrad m1, 12 %else psrad m0, 6 psrad m1, 6 %endif packssdw m0, m1 movu xm5, [r0 + r1] ; m5 = row 5 punpckhwd xm6, xm4, xm5 punpcklwd xm4, xm5 vinserti128 m4, m4, xm6, 1 pmaddwd m6, m4, [r5 + 1 * mmsize] paddd m2, m6 pmaddwd m4, [r5] movu xm6, [r0 + r1 * 2] ; m6 = row 6 punpckhwd xm1, xm5, xm6 punpcklwd xm5, xm6 vinserti128 m5, m5, xm1, 1 pmaddwd m1, m5, [r5 + 1 * mmsize] pmaddwd m5, [r5] paddd m3, m1 %ifidn %1,sp paddd m2, m7 paddd m3, m7 psrad m2, 12 psrad m3, 12 %else psrad m2, 6 psrad m3, 6 %endif packssdw m2, m3 movu xm1, [r0 + r4] ; m1 = row 7 punpckhwd xm3, xm6, xm1 punpcklwd xm6, xm1 vinserti128 m6, m6, xm3, 1 pmaddwd m6, [r5 + 1 * mmsize] paddd m4, m6 movu xm6, [r0 + r1 * 4] ; 
m6 = row 8 punpckhwd xm3, xm1, xm6 punpcklwd xm1, xm6 vinserti128 m1, m1, xm3, 1 pmaddwd m1, [r5 + 1 * mmsize] paddd m5, m1 %ifidn %1,sp paddd m4, m7 paddd m5, m7 psrad m4, 12 psrad m5, 12 %else psrad m4, 6 psrad m5, 6 %endif packssdw m4, m5 lea r4, [r3 * 3] %ifidn %1,sp packuswb m0, m2 mova m3, [interp8_hps_shuf] vpermd m0, m3, m0 vextracti128 xm2, m0, 1 vextracti128 xm5, m4, 1 packuswb xm4, xm5 pshufd xm4, xm4, 11011000b movq [r2], xm0 movhps [r2 + r3], xm0 movq [r2 + r3 * 2], xm2 movhps [r2 + r4], xm2 lea r2, [r2 + r3 * 4] movq [r2], xm4 movhps [r2 + r3], xm4 %else vpermq m0, m0, 11011000b vpermq m2, m2, 11011000b vpermq m4, m4, 11011000b movu [r2], xm0 vextracti128 xm0, m0, 1 vextracti128 xm3, m2, 1 vextracti128 xm5, m4, 1 movu [r2 + r3], xm0 movu [r2 + r3 * 2], xm2 movu [r2 + r4], xm3 lea r2, [r2 + r3 * 4] movu [r2], xm4 movu [r2 + r3], xm5 %endif RET %endmacro FILTER_VER_CHROMA_S_AVX2_8x6 sp FILTER_VER_CHROMA_S_AVX2_8x6 ss %macro FILTER_VER_CHROMA_S_AVX2_8xN 2 INIT_YMM avx2 %if ARCH_X86_64 == 1 cglobal interp_4tap_vert_%1_8x%2, 4, 7, 9 mov r4d, r4m shl r4d, 6 add r1d, r1d %ifdef PIC lea r5, [pw_ChromaCoeffV] add r5, r4 %else lea r5, [pw_ChromaCoeffV + r4] %endif lea r4, [r1 * 3] sub r0, r1 %ifidn %1,sp mova m8, [pd_526336] %else add r3d, r3d %endif lea r6, [r3 * 3] %rep %2 / 16 movu xm0, [r0] ; m0 = row 0 movu xm1, [r0 + r1] ; m1 = row 1 punpckhwd xm2, xm0, xm1 punpcklwd xm0, xm1 vinserti128 m0, m0, xm2, 1 pmaddwd m0, [r5] movu xm2, [r0 + r1 * 2] ; m2 = row 2 punpckhwd xm3, xm1, xm2 punpcklwd xm1, xm2 vinserti128 m1, m1, xm3, 1 pmaddwd m1, [r5] movu xm3, [r0 + r4] ; m3 = row 3 punpckhwd xm4, xm2, xm3 punpcklwd xm2, xm3 vinserti128 m2, m2, xm4, 1 pmaddwd m4, m2, [r5 + 1 * mmsize] paddd m0, m4 pmaddwd m2, [r5] lea r0, [r0 + r1 * 4] movu xm4, [r0] ; m4 = row 4 punpckhwd xm5, xm3, xm4 punpcklwd xm3, xm4 vinserti128 m3, m3, xm5, 1 pmaddwd m5, m3, [r5 + 1 * mmsize] paddd m1, m5 pmaddwd m3, [r5] %ifidn %1,sp paddd m0, m8 paddd m1, m8 psrad m0, 12 psrad m1, 12 %else psrad m0, 6 psrad m1, 6 %endif packssdw m0, m1 movu xm5, [r0 + r1] ; m5 = row 5 punpckhwd xm6, xm4, xm5 punpcklwd xm4, xm5 vinserti128 m4, m4, xm6, 1 pmaddwd m6, m4, [r5 + 1 * mmsize] paddd m2, m6 pmaddwd m4, [r5] movu xm6, [r0 + r1 * 2] ; m6 = row 6 punpckhwd xm1, xm5, xm6 punpcklwd xm5, xm6 vinserti128 m5, m5, xm1, 1 pmaddwd m1, m5, [r5 + 1 * mmsize] pmaddwd m5, [r5] paddd m3, m1 %ifidn %1,sp paddd m2, m8 paddd m3, m8 psrad m2, 12 psrad m3, 12 %else psrad m2, 6 psrad m3, 6 %endif packssdw m2, m3 %ifidn %1,sp packuswb m0, m2 mova m3, [interp8_hps_shuf] vpermd m0, m3, m0 vextracti128 xm2, m0, 1 movq [r2], xm0 movhps [r2 + r3], xm0 movq [r2 + r3 * 2], xm2 movhps [r2 + r6], xm2 %else vpermq m0, m0, 11011000b vpermq m2, m2, 11011000b movu [r2], xm0 vextracti128 xm0, m0, 1 vextracti128 xm3, m2, 1 movu [r2 + r3], xm0 movu [r2 + r3 * 2], xm2 movu [r2 + r6], xm3 %endif lea r2, [r2 + r3 * 4] movu xm1, [r0 + r4] ; m1 = row 7 punpckhwd xm0, xm6, xm1 punpcklwd xm6, xm1 vinserti128 m6, m6, xm0, 1 pmaddwd m0, m6, [r5 + 1 * mmsize] pmaddwd m6, [r5] paddd m4, m0 lea r0, [r0 + r1 * 4] movu xm0, [r0] ; m0 = row 8 punpckhwd xm2, xm1, xm0 punpcklwd xm1, xm0 vinserti128 m1, m1, xm2, 1 pmaddwd m2, m1, [r5 + 1 * mmsize] pmaddwd m1, [r5] paddd m5, m2 %ifidn %1,sp paddd m4, m8 paddd m5, m8 psrad m4, 12 psrad m5, 12 %else psrad m4, 6 psrad m5, 6 %endif packssdw m4, m5 movu xm2, [r0 + r1] ; m2 = row 9 punpckhwd xm5, xm0, xm2 punpcklwd xm0, xm2 vinserti128 m0, m0, xm5, 1 pmaddwd m5, m0, [r5 + 1 * mmsize] paddd m6, m5 pmaddwd m0, [r5] movu xm5, [r0 + r1 * 
2] ; m5 = row 10 punpckhwd xm7, xm2, xm5 punpcklwd xm2, xm5 vinserti128 m2, m2, xm7, 1 pmaddwd m7, m2, [r5 + 1 * mmsize] paddd m1, m7 pmaddwd m2, [r5] %ifidn %1,sp paddd m6, m8 paddd m1, m8 psrad m6, 12 psrad m1, 12 %else psrad m6, 6 psrad m1, 6 %endif packssdw m6, m1 %ifidn %1,sp packuswb m4, m6 vpermd m4, m3, m4 vextracti128 xm6, m4, 1 movq [r2], xm4 movhps [r2 + r3], xm4 movq [r2 + r3 * 2], xm6 movhps [r2 + r6], xm6 %else vpermq m4, m4, 11011000b vpermq m6, m6, 11011000b vextracti128 xm7, m4, 1 vextracti128 xm1, m6, 1 movu [r2], xm4 movu [r2 + r3], xm7 movu [r2 + r3 * 2], xm6 movu [r2 + r6], xm1 %endif lea r2, [r2 + r3 * 4] movu xm7, [r0 + r4] ; m7 = row 11 punpckhwd xm1, xm5, xm7 punpcklwd xm5, xm7 vinserti128 m5, m5, xm1, 1 pmaddwd m1, m5, [r5 + 1 * mmsize] paddd m0, m1 pmaddwd m5, [r5] lea r0, [r0 + r1 * 4] movu xm1, [r0] ; m1 = row 12 punpckhwd xm4, xm7, xm1 punpcklwd xm7, xm1 vinserti128 m7, m7, xm4, 1 pmaddwd m4, m7, [r5 + 1 * mmsize] paddd m2, m4 pmaddwd m7, [r5] %ifidn %1,sp paddd m0, m8 paddd m2, m8 psrad m0, 12 psrad m2, 12 %else psrad m0, 6 psrad m2, 6 %endif packssdw m0, m2 movu xm4, [r0 + r1] ; m4 = row 13 punpckhwd xm2, xm1, xm4 punpcklwd xm1, xm4 vinserti128 m1, m1, xm2, 1 pmaddwd m2, m1, [r5 + 1 * mmsize] paddd m5, m2 pmaddwd m1, [r5] movu xm2, [r0 + r1 * 2] ; m2 = row 14 punpckhwd xm6, xm4, xm2 punpcklwd xm4, xm2 vinserti128 m4, m4, xm6, 1 pmaddwd m6, m4, [r5 + 1 * mmsize] paddd m7, m6 pmaddwd m4, [r5] %ifidn %1,sp paddd m5, m8 paddd m7, m8 psrad m5, 12 psrad m7, 12 %else psrad m5, 6 psrad m7, 6 %endif packssdw m5, m7 %ifidn %1,sp packuswb m0, m5 vpermd m0, m3, m0 vextracti128 xm5, m0, 1 movq [r2], xm0 movhps [r2 + r3], xm0 movq [r2 + r3 * 2], xm5 movhps [r2 + r6], xm5 %else vpermq m0, m0, 11011000b vpermq m5, m5, 11011000b vextracti128 xm7, m0, 1 vextracti128 xm6, m5, 1 movu [r2], xm0 movu [r2 + r3], xm7 movu [r2 + r3 * 2], xm5 movu [r2 + r6], xm6 %endif lea r2, [r2 + r3 * 4] movu xm6, [r0 + r4] ; m6 = row 15 punpckhwd xm5, xm2, xm6 punpcklwd xm2, xm6 vinserti128 m2, m2, xm5, 1 pmaddwd m5, m2, [r5 + 1 * mmsize] paddd m1, m5 pmaddwd m2, [r5] lea r0, [r0 + r1 * 4] movu xm0, [r0] ; m0 = row 16 punpckhwd xm5, xm6, xm0 punpcklwd xm6, xm0 vinserti128 m6, m6, xm5, 1 pmaddwd m5, m6, [r5 + 1 * mmsize] paddd m4, m5 pmaddwd m6, [r5] %ifidn %1,sp paddd m1, m8 paddd m4, m8 psrad m1, 12 psrad m4, 12 %else psrad m1, 6 psrad m4, 6 %endif packssdw m1, m4 movu xm5, [r0 + r1] ; m5 = row 17 punpckhwd xm4, xm0, xm5 punpcklwd xm0, xm5 vinserti128 m0, m0, xm4, 1 pmaddwd m0, [r5 + 1 * mmsize] paddd m2, m0 movu xm4, [r0 + r1 * 2] ; m4 = row 18 punpckhwd xm0, xm5, xm4 punpcklwd xm5, xm4 vinserti128 m5, m5, xm0, 1 pmaddwd m5, [r5 + 1 * mmsize] paddd m6, m5 %ifidn %1,sp paddd m2, m8 paddd m6, m8 psrad m2, 12 psrad m6, 12 %else psrad m2, 6 psrad m6, 6 %endif packssdw m2, m6 %ifidn %1,sp packuswb m1, m2 vpermd m1, m3, m1 vextracti128 xm2, m1, 1 movq [r2], xm1 movhps [r2 + r3], xm1 movq [r2 + r3 * 2], xm2 movhps [r2 + r6], xm2 %else vpermq m1, m1, 11011000b vpermq m2, m2, 11011000b vextracti128 xm6, m1, 1 vextracti128 xm4, m2, 1 movu [r2], xm1 movu [r2 + r3], xm6 movu [r2 + r3 * 2], xm2 movu [r2 + r6], xm4 %endif lea r2, [r2 + r3 * 4] %endrep RET %endif %endmacro FILTER_VER_CHROMA_S_AVX2_8xN sp, 16 FILTER_VER_CHROMA_S_AVX2_8xN sp, 32 FILTER_VER_CHROMA_S_AVX2_8xN sp, 64 FILTER_VER_CHROMA_S_AVX2_8xN ss, 16 FILTER_VER_CHROMA_S_AVX2_8xN ss, 32 FILTER_VER_CHROMA_S_AVX2_8xN ss, 64 %macro FILTER_VER_CHROMA_S_AVX2_Nx24 2 %if ARCH_X86_64 == 1 INIT_YMM avx2 cglobal interp_4tap_vert_%1_%2x24, 4, 10, 10 mov 
r4d, r4m shl r4d, 6 add r1d, r1d %ifdef PIC lea r5, [pw_ChromaCoeffV] add r5, r4 %else lea r5, [pw_ChromaCoeffV + r4] %endif lea r4, [r1 * 3] sub r0, r1 %ifidn %1,sp mova m9, [pd_526336] %else add r3d, r3d %endif lea r6, [r3 * 3] mov r9d, %2 / 8 .loopW: PROCESS_CHROMA_S_AVX2_W8_16R %1 %ifidn %1,sp add r2, 8 %else add r2, 16 %endif add r0, 16 dec r9d jnz .loopW %ifidn %1,sp lea r2, [r8 + r3 * 4 - %2 + 8] %else lea r2, [r8 + r3 * 4 - 2 * %2 + 16] %endif lea r0, [r7 - 2 * %2 + 16] mova m7, m9 mov r9d, %2 / 8 .loop: PROCESS_CHROMA_S_AVX2_W8_8R %1 %ifidn %1,sp add r2, 8 %else add r2, 16 %endif add r0, 16 dec r9d jnz .loop RET %endif %endmacro FILTER_VER_CHROMA_S_AVX2_Nx24 sp, 32 FILTER_VER_CHROMA_S_AVX2_Nx24 sp, 16 FILTER_VER_CHROMA_S_AVX2_Nx24 ss, 32 FILTER_VER_CHROMA_S_AVX2_Nx24 ss, 16 %macro FILTER_VER_CHROMA_S_AVX2_2x8 1 INIT_YMM avx2 cglobal interp_4tap_vert_%1_2x8, 4, 6, 7 mov r4d, r4m shl r4d, 6 add r1d, r1d sub r0, r1 %ifdef PIC lea r5, [pw_ChromaCoeffV] add r5, r4 %else lea r5, [pw_ChromaCoeffV + r4] %endif lea r4, [r1 * 3] %ifidn %1,sp mova m6, [pd_526336] %else add r3d, r3d %endif movd xm0, [r0] movd xm1, [r0 + r1] punpcklwd xm0, xm1 movd xm2, [r0 + r1 * 2] punpcklwd xm1, xm2 punpcklqdq xm0, xm1 ; m0 = [2 1 1 0] movd xm3, [r0 + r4] punpcklwd xm2, xm3 lea r0, [r0 + 4 * r1] movd xm4, [r0] punpcklwd xm3, xm4 punpcklqdq xm2, xm3 ; m2 = [4 3 3 2] vinserti128 m0, m0, xm2, 1 ; m0 = [4 3 3 2 2 1 1 0] movd xm1, [r0 + r1] punpcklwd xm4, xm1 movd xm3, [r0 + r1 * 2] punpcklwd xm1, xm3 punpcklqdq xm4, xm1 ; m4 = [6 5 5 4] vinserti128 m2, m2, xm4, 1 ; m2 = [6 5 5 4 4 3 3 2] pmaddwd m0, [r5] pmaddwd m2, [r5 + 1 * mmsize] paddd m0, m2 movd xm1, [r0 + r4] punpcklwd xm3, xm1 lea r0, [r0 + 4 * r1] movd xm2, [r0] punpcklwd xm1, xm2 punpcklqdq xm3, xm1 ; m3 = [8 7 7 6] vinserti128 m4, m4, xm3, 1 ; m4 = [8 7 7 6 6 5 5 4] movd xm1, [r0 + r1] punpcklwd xm2, xm1 movd xm5, [r0 + r1 * 2] punpcklwd xm1, xm5 punpcklqdq xm2, xm1 ; m2 = [10 9 9 8] vinserti128 m3, m3, xm2, 1 ; m3 = [10 9 9 8 8 7 7 6] pmaddwd m4, [r5] pmaddwd m3, [r5 + 1 * mmsize] paddd m4, m3 %ifidn %1,sp paddd m0, m6 paddd m4, m6 psrad m0, 12 psrad m4, 12 %else psrad m0, 6 psrad m4, 6 %endif packssdw m0, m4 vextracti128 xm4, m0, 1 lea r4, [r3 * 3] %ifidn %1,sp packuswb xm0, xm4 pextrw [r2], xm0, 0 pextrw [r2 + r3], xm0, 1 pextrw [r2 + 2 * r3], xm0, 4 pextrw [r2 + r4], xm0, 5 lea r2, [r2 + r3 * 4] pextrw [r2], xm0, 2 pextrw [r2 + r3], xm0, 3 pextrw [r2 + 2 * r3], xm0, 6 pextrw [r2 + r4], xm0, 7 %else movd [r2], xm0 pextrd [r2 + r3], xm0, 1 movd [r2 + 2 * r3], xm4 pextrd [r2 + r4], xm4, 1 lea r2, [r2 + r3 * 4] pextrd [r2], xm0, 2 pextrd [r2 + r3], xm0, 3 pextrd [r2 + 2 * r3], xm4, 2 pextrd [r2 + r4], xm4, 3 %endif RET %endmacro FILTER_VER_CHROMA_S_AVX2_2x8 sp FILTER_VER_CHROMA_S_AVX2_2x8 ss %macro FILTER_VER_CHROMA_S_AVX2_2x16 1 %if ARCH_X86_64 == 1 INIT_YMM avx2 cglobal interp_4tap_vert_%1_2x16, 4, 6, 9 mov r4d, r4m shl r4d, 6 add r1d, r1d sub r0, r1 %ifdef PIC lea r5, [pw_ChromaCoeffV] add r5, r4 %else lea r5, [pw_ChromaCoeffV + r4] %endif lea r4, [r1 * 3] %ifidn %1,sp mova m6, [pd_526336] %else add r3d, r3d %endif movd xm0, [r0] movd xm1, [r0 + r1] punpcklwd xm0, xm1 movd xm2, [r0 + r1 * 2] punpcklwd xm1, xm2 punpcklqdq xm0, xm1 ; m0 = [2 1 1 0] movd xm3, [r0 + r4] punpcklwd xm2, xm3 lea r0, [r0 + 4 * r1] movd xm4, [r0] punpcklwd xm3, xm4 punpcklqdq xm2, xm3 ; m2 = [4 3 3 2] vinserti128 m0, m0, xm2, 1 ; m0 = [4 3 3 2 2 1 1 0] movd xm1, [r0 + r1] punpcklwd xm4, xm1 movd xm3, [r0 + r1 * 2] punpcklwd xm1, xm3 punpcklqdq xm4, xm1 ; m4 = [6 5 5 4] 
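; For these narrow (2-wide) blocks each qword holds one word-interleaved
; pair of adjacent rows (the [N+1 N] bracket comments), so a single
; pmaddwd against the duplicated coefficient pair at [r5] (taps 0/1) or
; [r5 + 1 * mmsize] (taps 2/3) produces the two-tap partial sums for
; four output rows per ymm register.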
vinserti128 m2, m2, xm4, 1 ; m2 = [6 5 5 4 4 3 3 2] pmaddwd m0, [r5] pmaddwd m2, [r5 + 1 * mmsize] paddd m0, m2 movd xm1, [r0 + r4] punpcklwd xm3, xm1 lea r0, [r0 + 4 * r1] movd xm2, [r0] punpcklwd xm1, xm2 punpcklqdq xm3, xm1 ; m3 = [8 7 7 6] vinserti128 m4, m4, xm3, 1 ; m4 = [8 7 7 6 6 5 5 4] movd xm1, [r0 + r1] punpcklwd xm2, xm1 movd xm5, [r0 + r1 * 2] punpcklwd xm1, xm5 punpcklqdq xm2, xm1 ; m2 = [10 9 9 8] vinserti128 m3, m3, xm2, 1 ; m3 = [10 9 9 8 8 7 7 6] pmaddwd m4, [r5] pmaddwd m3, [r5 + 1 * mmsize] paddd m4, m3 movd xm1, [r0 + r4] punpcklwd xm5, xm1 lea r0, [r0 + 4 * r1] movd xm3, [r0] punpcklwd xm1, xm3 punpcklqdq xm5, xm1 ; m5 = [12 11 11 10] vinserti128 m2, m2, xm5, 1 ; m2 = [12 11 11 10 10 9 9 8] movd xm1, [r0 + r1] punpcklwd xm3, xm1 movd xm7, [r0 + r1 * 2] punpcklwd xm1, xm7 punpcklqdq xm3, xm1 ; m3 = [14 13 13 12] vinserti128 m5, m5, xm3, 1 ; m5 = [14 13 13 12 12 11 11 10] pmaddwd m2, [r5] pmaddwd m5, [r5 + 1 * mmsize] paddd m2, m5 movd xm5, [r0 + r4] punpcklwd xm7, xm5 lea r0, [r0 + 4 * r1] movd xm1, [r0] punpcklwd xm5, xm1 punpcklqdq xm7, xm5 ; m7 = [16 15 15 14] vinserti128 m3, m3, xm7, 1 ; m3 = [16 15 15 14 14 13 13 12] movd xm5, [r0 + r1] punpcklwd xm1, xm5 movd xm8, [r0 + r1 * 2] punpcklwd xm5, xm8 punpcklqdq xm1, xm5 ; m1 = [18 17 17 16] vinserti128 m7, m7, xm1, 1 ; m7 = [18 17 17 16 16 15 15 14] pmaddwd m3, [r5] pmaddwd m7, [r5 + 1 * mmsize] paddd m3, m7 %ifidn %1,sp paddd m0, m6 paddd m4, m6 paddd m2, m6 paddd m3, m6 psrad m0, 12 psrad m4, 12 psrad m2, 12 psrad m3, 12 %else psrad m0, 6 psrad m4, 6 psrad m2, 6 psrad m3, 6 %endif packssdw m0, m4 packssdw m2, m3 lea r4, [r3 * 3] %ifidn %1,sp packuswb m0, m2 vextracti128 xm2, m0, 1 pextrw [r2], xm0, 0 pextrw [r2 + r3], xm0, 1 pextrw [r2 + 2 * r3], xm2, 0 pextrw [r2 + r4], xm2, 1 lea r2, [r2 + r3 * 4] pextrw [r2], xm0, 2 pextrw [r2 + r3], xm0, 3 pextrw [r2 + 2 * r3], xm2, 2 pextrw [r2 + r4], xm2, 3 lea r2, [r2 + r3 * 4] pextrw [r2], xm0, 4 pextrw [r2 + r3], xm0, 5 pextrw [r2 + 2 * r3], xm2, 4 pextrw [r2 + r4], xm2, 5 lea r2, [r2 + r3 * 4] pextrw [r2], xm0, 6 pextrw [r2 + r3], xm0, 7 pextrw [r2 + 2 * r3], xm2, 6 pextrw [r2 + r4], xm2, 7 %else vextracti128 xm4, m0, 1 vextracti128 xm3, m2, 1 movd [r2], xm0 pextrd [r2 + r3], xm0, 1 movd [r2 + 2 * r3], xm4 pextrd [r2 + r4], xm4, 1 lea r2, [r2 + r3 * 4] pextrd [r2], xm0, 2 pextrd [r2 + r3], xm0, 3 pextrd [r2 + 2 * r3], xm4, 2 pextrd [r2 + r4], xm4, 3 lea r2, [r2 + r3 * 4] movd [r2], xm2 pextrd [r2 + r3], xm2, 1 movd [r2 + 2 * r3], xm3 pextrd [r2 + r4], xm3, 1 lea r2, [r2 + r3 * 4] pextrd [r2], xm2, 2 pextrd [r2 + r3], xm2, 3 pextrd [r2 + 2 * r3], xm3, 2 pextrd [r2 + r4], xm3, 3 %endif RET %endif %endmacro FILTER_VER_CHROMA_S_AVX2_2x16 sp FILTER_VER_CHROMA_S_AVX2_2x16 ss %macro FILTER_VER_CHROMA_S_AVX2_6x8 1 INIT_YMM avx2 cglobal interp_4tap_vert_%1_6x8, 4, 6, 8 mov r4d, r4m shl r4d, 6 add r1d, r1d %ifdef PIC lea r5, [pw_ChromaCoeffV] add r5, r4 %else lea r5, [pw_ChromaCoeffV + r4] %endif lea r4, [r1 * 3] sub r0, r1 %ifidn %1,sp mova m7, [pd_526336] %else add r3d, r3d %endif movu xm0, [r0] ; m0 = row 0 movu xm1, [r0 + r1] ; m1 = row 1 punpckhwd xm2, xm0, xm1 punpcklwd xm0, xm1 vinserti128 m0, m0, xm2, 1 pmaddwd m0, [r5] movu xm2, [r0 + r1 * 2] ; m2 = row 2 punpckhwd xm3, xm1, xm2 punpcklwd xm1, xm2 vinserti128 m1, m1, xm3, 1 pmaddwd m1, [r5] movu xm3, [r0 + r4] ; m3 = row 3 punpckhwd xm4, xm2, xm3 punpcklwd xm2, xm3 vinserti128 m2, m2, xm4, 1 pmaddwd m4, m2, [r5 + 1 * mmsize] pmaddwd m2, [r5] paddd m0, m4 lea r0, [r0 + r1 * 4] movu xm4, [r0] ; m4 = row 4 punpckhwd xm5, xm3, 
xm4 punpcklwd xm3, xm4 vinserti128 m3, m3, xm5, 1 pmaddwd m5, m3, [r5 + 1 * mmsize] pmaddwd m3, [r5] paddd m1, m5 %ifidn %1,sp paddd m0, m7 paddd m1, m7 psrad m0, 12 psrad m1, 12 %else psrad m0, 6 psrad m1, 6 %endif packssdw m0, m1 movu xm5, [r0 + r1] ; m5 = row 5 punpckhwd xm6, xm4, xm5 punpcklwd xm4, xm5 vinserti128 m4, m4, xm6, 1 pmaddwd m6, m4, [r5 + 1 * mmsize] paddd m2, m6 pmaddwd m4, [r5] movu xm6, [r0 + r1 * 2] ; m6 = row 6 punpckhwd xm1, xm5, xm6 punpcklwd xm5, xm6 vinserti128 m5, m5, xm1, 1 pmaddwd m1, m5, [r5 + 1 * mmsize] pmaddwd m5, [r5] paddd m3, m1 %ifidn %1,sp paddd m2, m7 paddd m3, m7 psrad m2, 12 psrad m3, 12 %else psrad m2, 6 psrad m3, 6 %endif packssdw m2, m3 movu xm1, [r0 + r4] ; m1 = row 7 punpckhwd xm3, xm6, xm1 punpcklwd xm6, xm1 vinserti128 m6, m6, xm3, 1 pmaddwd m3, m6, [r5 + 1 * mmsize] pmaddwd m6, [r5] paddd m4, m3 lea r4, [r3 * 3] %ifidn %1,sp packuswb m0, m2 vextracti128 xm2, m0, 1 movd [r2], xm0 pextrw [r2 + 4], xm2, 0 pextrd [r2 + r3], xm0, 1 pextrw [r2 + r3 + 4], xm2, 2 pextrd [r2 + r3 * 2], xm0, 2 pextrw [r2 + r3 * 2 + 4], xm2, 4 pextrd [r2 + r4], xm0, 3 pextrw [r2 + r4 + 4], xm2, 6 %else movq [r2], xm0 movhps [r2 + r3], xm0 movq [r2 + r3 * 2], xm2 movhps [r2 + r4], xm2 vextracti128 xm0, m0, 1 vextracti128 xm3, m2, 1 movd [r2 + 8], xm0 pextrd [r2 + r3 + 8], xm0, 2 movd [r2 + r3 * 2 + 8], xm3 pextrd [r2 + r4 + 8], xm3, 2 %endif lea r2, [r2 + r3 * 4] lea r0, [r0 + r1 * 4] movu xm0, [r0] ; m0 = row 8 punpckhwd xm2, xm1, xm0 punpcklwd xm1, xm0 vinserti128 m1, m1, xm2, 1 pmaddwd m2, m1, [r5 + 1 * mmsize] pmaddwd m1, [r5] paddd m5, m2 %ifidn %1,sp paddd m4, m7 paddd m5, m7 psrad m4, 12 psrad m5, 12 %else psrad m4, 6 psrad m5, 6 %endif packssdw m4, m5 movu xm2, [r0 + r1] ; m2 = row 9 punpckhwd xm5, xm0, xm2 punpcklwd xm0, xm2 vinserti128 m0, m0, xm5, 1 pmaddwd m0, [r5 + 1 * mmsize] paddd m6, m0 movu xm5, [r0 + r1 * 2] ; m5 = row 10 punpckhwd xm0, xm2, xm5 punpcklwd xm2, xm5 vinserti128 m2, m2, xm0, 1 pmaddwd m2, [r5 + 1 * mmsize] paddd m1, m2 %ifidn %1,sp paddd m6, m7 paddd m1, m7 psrad m6, 12 psrad m1, 12 %else psrad m6, 6 psrad m1, 6 %endif packssdw m6, m1 %ifidn %1,sp packuswb m4, m6 vextracti128 xm6, m4, 1 movd [r2], xm4 pextrw [r2 + 4], xm6, 0 pextrd [r2 + r3], xm4, 1 pextrw [r2 + r3 + 4], xm6, 2 pextrd [r2 + r3 * 2], xm4, 2 pextrw [r2 + r3 * 2 + 4], xm6, 4 pextrd [r2 + r4], xm4, 3 pextrw [r2 + r4 + 4], xm6, 6 %else movq [r2], xm4 movhps [r2 + r3], xm4 movq [r2 + r3 * 2], xm6 movhps [r2 + r4], xm6 vextracti128 xm5, m4, 1 vextracti128 xm1, m6, 1 movd [r2 + 8], xm5 pextrd [r2 + r3 + 8], xm5, 2 movd [r2 + r3 * 2 + 8], xm1 pextrd [r2 + r4 + 8], xm1, 2 %endif RET %endmacro FILTER_VER_CHROMA_S_AVX2_6x8 sp FILTER_VER_CHROMA_S_AVX2_6x8 ss %macro FILTER_VER_CHROMA_S_AVX2_6x16 1 %if ARCH_X86_64 == 1 INIT_YMM avx2 cglobal interp_4tap_vert_%1_6x16, 4, 7, 9 mov r4d, r4m shl r4d, 6 add r1d, r1d %ifdef PIC lea r5, [pw_ChromaCoeffV] add r5, r4 %else lea r5, [pw_ChromaCoeffV + r4] %endif lea r4, [r1 * 3] sub r0, r1 %ifidn %1,sp mova m8, [pd_526336] %else add r3d, r3d %endif lea r6, [r3 * 3] movu xm0, [r0] ; m0 = row 0 movu xm1, [r0 + r1] ; m1 = row 1 punpckhwd xm2, xm0, xm1 punpcklwd xm0, xm1 vinserti128 m0, m0, xm2, 1 pmaddwd m0, [r5] movu xm2, [r0 + r1 * 2] ; m2 = row 2 punpckhwd xm3, xm1, xm2 punpcklwd xm1, xm2 vinserti128 m1, m1, xm3, 1 pmaddwd m1, [r5] movu xm3, [r0 + r4] ; m3 = row 3 punpckhwd xm4, xm2, xm3 punpcklwd xm2, xm3 vinserti128 m2, m2, xm4, 1 pmaddwd m4, m2, [r5 + 1 * mmsize] paddd m0, m4 pmaddwd m2, [r5] lea r0, [r0 + r1 * 4] movu xm4, [r0] ; m4 = row 4 
punpckhwd xm5, xm3, xm4 punpcklwd xm3, xm4 vinserti128 m3, m3, xm5, 1 pmaddwd m5, m3, [r5 + 1 * mmsize] paddd m1, m5 pmaddwd m3, [r5] %ifidn %1,sp paddd m0, m8 paddd m1, m8 psrad m0, 12 psrad m1, 12 %else psrad m0, 6 psrad m1, 6 %endif packssdw m0, m1 movu xm5, [r0 + r1] ; m5 = row 5 punpckhwd xm6, xm4, xm5 punpcklwd xm4, xm5 vinserti128 m4, m4, xm6, 1 pmaddwd m6, m4, [r5 + 1 * mmsize] paddd m2, m6 pmaddwd m4, [r5] movu xm6, [r0 + r1 * 2] ; m6 = row 6 punpckhwd xm1, xm5, xm6 punpcklwd xm5, xm6 vinserti128 m5, m5, xm1, 1 pmaddwd m1, m5, [r5 + 1 * mmsize] pmaddwd m5, [r5] paddd m3, m1 %ifidn %1,sp paddd m2, m8 paddd m3, m8 psrad m2, 12 psrad m3, 12 %else psrad m2, 6 psrad m3, 6 %endif packssdw m2, m3 %ifidn %1,sp packuswb m0, m2 vextracti128 xm2, m0, 1 movd [r2], xm0 pextrw [r2 + 4], xm2, 0 pextrd [r2 + r3], xm0, 1 pextrw [r2 + r3 + 4], xm2, 2 pextrd [r2 + r3 * 2], xm0, 2 pextrw [r2 + r3 * 2 + 4], xm2, 4 pextrd [r2 + r6], xm0, 3 pextrw [r2 + r6 + 4], xm2, 6 %else movq [r2], xm0 movhps [r2 + r3], xm0 movq [r2 + r3 * 2], xm2 movhps [r2 + r6], xm2 vextracti128 xm0, m0, 1 vextracti128 xm3, m2, 1 movd [r2 + 8], xm0 pextrd [r2 + r3 + 8], xm0, 2 movd [r2 + r3 * 2 + 8], xm3 pextrd [r2 + r6 + 8], xm3, 2 %endif lea r2, [r2 + r3 * 4] movu xm1, [r0 + r4] ; m1 = row 7 punpckhwd xm0, xm6, xm1 punpcklwd xm6, xm1 vinserti128 m6, m6, xm0, 1 pmaddwd m0, m6, [r5 + 1 * mmsize] pmaddwd m6, [r5] paddd m4, m0 lea r0, [r0 + r1 * 4] movu xm0, [r0] ; m0 = row 8 punpckhwd xm2, xm1, xm0 punpcklwd xm1, xm0 vinserti128 m1, m1, xm2, 1 pmaddwd m2, m1, [r5 + 1 * mmsize] pmaddwd m1, [r5] paddd m5, m2 %ifidn %1,sp paddd m4, m8 paddd m5, m8 psrad m4, 12 psrad m5, 12 %else psrad m4, 6 psrad m5, 6 %endif packssdw m4, m5 movu xm2, [r0 + r1] ; m2 = row 9 punpckhwd xm5, xm0, xm2 punpcklwd xm0, xm2 vinserti128 m0, m0, xm5, 1 pmaddwd m5, m0, [r5 + 1 * mmsize] paddd m6, m5 pmaddwd m0, [r5] movu xm5, [r0 + r1 * 2] ; m5 = row 10 punpckhwd xm7, xm2, xm5 punpcklwd xm2, xm5 vinserti128 m2, m2, xm7, 1 pmaddwd m7, m2, [r5 + 1 * mmsize] paddd m1, m7 pmaddwd m2, [r5] %ifidn %1,sp paddd m6, m8 paddd m1, m8 psrad m6, 12 psrad m1, 12 %else psrad m6, 6 psrad m1, 6 %endif packssdw m6, m1 %ifidn %1,sp packuswb m4, m6 vextracti128 xm6, m4, 1 movd [r2], xm4 pextrw [r2 + 4], xm6, 0 pextrd [r2 + r3], xm4, 1 pextrw [r2 + r3 + 4], xm6, 2 pextrd [r2 + r3 * 2], xm4, 2 pextrw [r2 + r3 * 2 + 4], xm6, 4 pextrd [r2 + r6], xm4, 3 pextrw [r2 + r6 + 4], xm6, 6 %else movq [r2], xm4 movhps [r2 + r3], xm4 movq [r2 + r3 * 2], xm6 movhps [r2 + r6], xm6 vextracti128 xm4, m4, 1 vextracti128 xm1, m6, 1 movd [r2 + 8], xm4 pextrd [r2 + r3 + 8], xm4, 2 movd [r2 + r3 * 2 + 8], xm1 pextrd [r2 + r6 + 8], xm1, 2 %endif lea r2, [r2 + r3 * 4] movu xm7, [r0 + r4] ; m7 = row 11 punpckhwd xm1, xm5, xm7 punpcklwd xm5, xm7 vinserti128 m5, m5, xm1, 1 pmaddwd m1, m5, [r5 + 1 * mmsize] paddd m0, m1 pmaddwd m5, [r5] lea r0, [r0 + r1 * 4] movu xm1, [r0] ; m1 = row 12 punpckhwd xm4, xm7, xm1 punpcklwd xm7, xm1 vinserti128 m7, m7, xm4, 1 pmaddwd m4, m7, [r5 + 1 * mmsize] paddd m2, m4 pmaddwd m7, [r5] %ifidn %1,sp paddd m0, m8 paddd m2, m8 psrad m0, 12 psrad m2, 12 %else psrad m0, 6 psrad m2, 6 %endif packssdw m0, m2 movu xm4, [r0 + r1] ; m4 = row 13 punpckhwd xm2, xm1, xm4 punpcklwd xm1, xm4 vinserti128 m1, m1, xm2, 1 pmaddwd m2, m1, [r5 + 1 * mmsize] paddd m5, m2 pmaddwd m1, [r5] movu xm2, [r0 + r1 * 2] ; m2 = row 14 punpckhwd xm6, xm4, xm2 punpcklwd xm4, xm2 vinserti128 m4, m4, xm6, 1 pmaddwd m6, m4, [r5 + 1 * mmsize] paddd m7, m6 pmaddwd m4, [r5] %ifidn %1,sp paddd m5, m8 paddd m7, m8 
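; sp rounding term: pd_526336 = 8192 * 64 + 2048, i.e. the DC offset of
; the 16-bit intermediate format (8192, the x265-style IF_INTERNAL_OFFS
; convention) scaled by the 64x filter gain, plus the 1 << (12 - 1)
; rounding constant, applied before the 12-bit shift back to pixel
; range; the ss path only needs the plain arithmetic >> 6.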
psrad m5, 12 psrad m7, 12 %else psrad m5, 6 psrad m7, 6 %endif packssdw m5, m7 %ifidn %1,sp packuswb m0, m5 vextracti128 xm5, m0, 1 movd [r2], xm0 pextrw [r2 + 4], xm5, 0 pextrd [r2 + r3], xm0, 1 pextrw [r2 + r3 + 4], xm5, 2 pextrd [r2 + r3 * 2], xm0, 2 pextrw [r2 + r3 * 2 + 4], xm5, 4 pextrd [r2 + r6], xm0, 3 pextrw [r2 + r6 + 4], xm5, 6 %else movq [r2], xm0 movhps [r2 + r3], xm0 movq [r2 + r3 * 2], xm5 movhps [r2 + r6], xm5 vextracti128 xm0, m0, 1 vextracti128 xm7, m5, 1 movd [r2 + 8], xm0 pextrd [r2 + r3 + 8], xm0, 2 movd [r2 + r3 * 2 + 8], xm7 pextrd [r2 + r6 + 8], xm7, 2 %endif lea r2, [r2 + r3 * 4] movu xm6, [r0 + r4] ; m6 = row 15 punpckhwd xm5, xm2, xm6 punpcklwd xm2, xm6 vinserti128 m2, m2, xm5, 1 pmaddwd m5, m2, [r5 + 1 * mmsize] paddd m1, m5 pmaddwd m2, [r5] lea r0, [r0 + r1 * 4] movu xm0, [r0] ; m0 = row 16 punpckhwd xm5, xm6, xm0 punpcklwd xm6, xm0 vinserti128 m6, m6, xm5, 1 pmaddwd m5, m6, [r5 + 1 * mmsize] paddd m4, m5 pmaddwd m6, [r5] %ifidn %1,sp paddd m1, m8 paddd m4, m8 psrad m1, 12 psrad m4, 12 %else psrad m1, 6 psrad m4, 6 %endif packssdw m1, m4 movu xm5, [r0 + r1] ; m5 = row 17 punpckhwd xm4, xm0, xm5 punpcklwd xm0, xm5 vinserti128 m0, m0, xm4, 1 pmaddwd m0, [r5 + 1 * mmsize] paddd m2, m0 movu xm4, [r0 + r1 * 2] ; m4 = row 18 punpckhwd xm0, xm5, xm4 punpcklwd xm5, xm4 vinserti128 m5, m5, xm0, 1 pmaddwd m5, [r5 + 1 * mmsize] paddd m6, m5 %ifidn %1,sp paddd m2, m8 paddd m6, m8 psrad m2, 12 psrad m6, 12 %else psrad m2, 6 psrad m6, 6 %endif packssdw m2, m6 %ifidn %1,sp packuswb m1, m2 vextracti128 xm2, m1, 1 movd [r2], xm1 pextrw [r2 + 4], xm2, 0 pextrd [r2 + r3], xm1, 1 pextrw [r2 + r3 + 4], xm2, 2 pextrd [r2 + r3 * 2], xm1, 2 pextrw [r2 + r3 * 2 + 4], xm2, 4 pextrd [r2 + r6], xm1, 3 pextrw [r2 + r6 + 4], xm2, 6 %else movq [r2], xm1 movhps [r2 + r3], xm1 movq [r2 + r3 * 2], xm2 movhps [r2 + r6], xm2 vextracti128 xm4, m1, 1 vextracti128 xm6, m2, 1 movd [r2 + 8], xm4 pextrd [r2 + r3 + 8], xm4, 2 movd [r2 + r3 * 2 + 8], xm6 pextrd [r2 + r6 + 8], xm6, 2 %endif RET %endif %endmacro FILTER_VER_CHROMA_S_AVX2_6x16 sp FILTER_VER_CHROMA_S_AVX2_6x16 ss ;--------------------------------------------------------------------------------------------------------------------- ; void interp_4tap_vertical_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) ;--------------------------------------------------------------------------------------------------------------------- %macro FILTER_VER_CHROMA_SS_W2_4R 2 INIT_XMM sse4 cglobal interp_4tap_vert_ss_%1x%2, 5, 6, 5 add r1d, r1d add r3d, r3d sub r0, r1 shl r4d, 5 %ifdef PIC lea r5, [tab_ChromaCoeffV] lea r5, [r5 + r4] %else lea r5, [tab_ChromaCoeffV + r4] %endif mov r4d, (%2/4) .loopH: PROCESS_CHROMA_SP_W2_4R r5 psrad m0, 6 psrad m2, 6 packssdw m0, m2 movd [r2], m0 pextrd [r2 + r3], m0, 1 lea r2, [r2 + 2 * r3] pextrd [r2], m0, 2 pextrd [r2 + r3], m0, 3 lea r2, [r2 + 2 * r3] dec r4d jnz .loopH RET %endmacro FILTER_VER_CHROMA_SS_W2_4R 2, 4 FILTER_VER_CHROMA_SS_W2_4R 2, 8 FILTER_VER_CHROMA_SS_W2_4R 2, 16 ;--------------------------------------------------------------------------------------------------------------- ; void interp_4tap_vert_ss_4x2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) ;--------------------------------------------------------------------------------------------------------------- INIT_XMM sse2 cglobal interp_4tap_vert_ss_4x2, 5, 6, 4 add r1d, r1d add r3d, r3d sub r0, r1 shl r4d, 5 %ifdef PIC lea r5, [tab_ChromaCoeffV] lea r5, [r5 + r4] %else lea r5, 
[tab_ChromaCoeffV + r4] %endif movq m0, [r0] movq m1, [r0 + r1] punpcklwd m0, m1 ;m0=[0 1] pmaddwd m0, [r5 + 0 *16] ;m0=[0+1] Row1 lea r0, [r0 + 2 * r1] movq m2, [r0] punpcklwd m1, m2 ;m1=[1 2] pmaddwd m1, [r5 + 0 *16] ;m1=[1+2] Row2 movq m3, [r0 + r1] punpcklwd m2, m3 ;m2=[2 3] pmaddwd m2, [r5 + 1 * 16] paddd m0, m2 ;m0=[0+1+2+3] Row1 done psrad m0, 6 movq m2, [r0 + 2 * r1] punpcklwd m3, m2 ;m3=[3 4] pmaddwd m3, [r5 + 1 * 16] paddd m1, m3 ;m1=[1+2+3+4] Row2 done psrad m1, 6 packssdw m0, m1 movlps [r2], m0 movhps [r2 + r3], m0 RET ;------------------------------------------------------------------------------------------------------------------- ; void interp_4tap_vertical_ss_6x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) ;------------------------------------------------------------------------------------------------------------------- %macro FILTER_VER_CHROMA_SS_W6_H4 2 INIT_XMM sse4 cglobal interp_4tap_vert_ss_6x%2, 5, 7, 6 add r1d, r1d add r3d, r3d sub r0, r1 shl r4d, 5 %ifdef PIC lea r5, [tab_ChromaCoeffV] lea r6, [r5 + r4] %else lea r6, [tab_ChromaCoeffV + r4] %endif mov r4d, %2/4 .loopH: PROCESS_CHROMA_SP_W4_4R psrad m0, 6 psrad m1, 6 psrad m2, 6 psrad m3, 6 packssdw m0, m1 packssdw m2, m3 movlps [r2], m0 movhps [r2 + r3], m0 lea r5, [r2 + 2 * r3] movlps [r5], m2 movhps [r5 + r3], m2 lea r5, [4 * r1 - 2 * 4] sub r0, r5 add r2, 2 * 4 PROCESS_CHROMA_SP_W2_4R r6 psrad m0, 6 psrad m2, 6 packssdw m0, m2 movd [r2], m0 pextrd [r2 + r3], m0, 1 lea r2, [r2 + 2 * r3] pextrd [r2], m0, 2 pextrd [r2 + r3], m0, 3 sub r0, 2 * 4 lea r2, [r2 + 2 * r3 - 2 * 4] dec r4d jnz .loopH RET %endmacro FILTER_VER_CHROMA_SS_W6_H4 6, 8 FILTER_VER_CHROMA_SS_W6_H4 6, 16 ;---------------------------------------------------------------------------------------------------------------- ; void interp_4tap_vert_ss_8x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) ;---------------------------------------------------------------------------------------------------------------- %macro FILTER_VER_CHROMA_SS_W8_H2 2 INIT_XMM sse2 cglobal interp_4tap_vert_ss_%1x%2, 5, 6, 7 add r1d, r1d add r3d, r3d sub r0, r1 shl r4d, 5 %ifdef PIC lea r5, [tab_ChromaCoeffV] lea r5, [r5 + r4] %else lea r5, [tab_ChromaCoeffV + r4] %endif mov r4d, %2/2 .loopH: PROCESS_CHROMA_SP_W8_2R psrad m0, 6 psrad m1, 6 psrad m2, 6 psrad m3, 6 packssdw m0, m1 packssdw m2, m3 movu [r2], m0 movu [r2 + r3], m2 lea r2, [r2 + 2 * r3] dec r4d jnz .loopH RET %endmacro FILTER_VER_CHROMA_SS_W8_H2 8, 2 FILTER_VER_CHROMA_SS_W8_H2 8, 4 FILTER_VER_CHROMA_SS_W8_H2 8, 6 FILTER_VER_CHROMA_SS_W8_H2 8, 8 FILTER_VER_CHROMA_SS_W8_H2 8, 16 FILTER_VER_CHROMA_SS_W8_H2 8, 32 FILTER_VER_CHROMA_SS_W8_H2 8, 12 FILTER_VER_CHROMA_SS_W8_H2 8, 64 ;----------------------------------------------------------------------------------------------------------------- ; void interp_8tap_vert_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) ;----------------------------------------------------------------------------------------------------------------- %macro FILTER_VER_LUMA_SS 2 INIT_XMM sse2 cglobal interp_8tap_vert_ss_%1x%2, 5, 7, 7, 0-gprsize add r1d, r1d add r3d, r3d lea r5, [3 * r1] sub r0, r5 shl r4d, 6 %ifdef PIC lea r5, [tab_LumaCoeffV] lea r6, [r5 + r4] %else lea r6, [tab_LumaCoeffV + r4] %endif mov dword [rsp], %2/4 .loopH: mov r4d, (%1/4) .loopW: movq m0, [r0] movq m1, [r0 + r1] punpcklwd m0, m1 ;m0=[0 1] pmaddwd m0, [r6 + 0 *16] ;m0=[0+1] Row1 lea r0, [r0 + 2 * r1] movq
m4, [r0] punpcklwd m1, m4 ;m1=[1 2] pmaddwd m1, [r6 + 0 *16] ;m1=[1+2] Row2 movq m5, [r0 + r1] punpcklwd m4, m5 ;m4=[2 3] pmaddwd m2, m4, [r6 + 0 *16] ;m2=[2+3] Row3 pmaddwd m4, [r6 + 1 * 16] paddd m0, m4 ;m0=[0+1+2+3] Row1 lea r0, [r0 + 2 * r1] movq m4, [r0] punpcklwd m5, m4 ;m5=[3 4] pmaddwd m3, m5, [r6 + 0 *16] ;m3=[3+4] Row4 pmaddwd m5, [r6 + 1 * 16] paddd m1, m5 ;m1 = [1+2+3+4] Row2 movq m5, [r0 + r1] punpcklwd m4, m5 ;m4=[4 5] pmaddwd m6, m4, [r6 + 1 * 16] paddd m2, m6 ;m2=[2+3+4+5] Row3 pmaddwd m4, [r6 + 2 * 16] paddd m0, m4 ;m0=[0+1+2+3+4+5] Row1 lea r0, [r0 + 2 * r1] movq m4, [r0] punpcklwd m5, m4 ;m5=[5 6] pmaddwd m6, m5, [r6 + 1 * 16] paddd m3, m6 ;m3=[3+4+5+6] Row4 pmaddwd m5, [r6 + 2 * 16] paddd m1, m5 ;m1=[1+2+3+4+5+6] Row2 movq m5, [r0 + r1] punpcklwd m4, m5 ;m4=[6 7] pmaddwd m6, m4, [r6 + 2 * 16] paddd m2, m6 ;m2=[2+3+4+5+6+7] Row3 pmaddwd m4, [r6 + 3 * 16] paddd m0, m4 ;m0=[0+1+2+3+4+5+6+7] Row1 end psrad m0, 6 lea r0, [r0 + 2 * r1] movq m4, [r0] punpcklwd m5, m4 ;m5=[7 8] pmaddwd m6, m5, [r6 + 2 * 16] paddd m3, m6 ;m3=[3+4+5+6+7+8] Row4 pmaddwd m5, [r6 + 3 * 16] paddd m1, m5 ;m1=[1+2+3+4+5+6+7+8] Row2 end psrad m1, 6 packssdw m0, m1 movlps [r2], m0 movhps [r2 + r3], m0 movq m5, [r0 + r1] punpcklwd m4, m5 ;m4=[8 9] pmaddwd m4, [r6 + 3 * 16] paddd m2, m4 ;m2=[2+3+4+5+6+7+8+9] Row3 end psrad m2, 6 movq m4, [r0 + 2 * r1] punpcklwd m5, m4 ;m5=[9 10] pmaddwd m5, [r6 + 3 * 16] paddd m3, m5 ;m3=[3+4+5+6+7+8+9+10] Row4 end psrad m3, 6 packssdw m2, m3 movlps [r2 + 2 * r3], m2 lea r5, [3 * r3] movhps [r2 + r5], m2 lea r5, [8 * r1 - 2 * 4] sub r0, r5 add r2, 2 * 4 dec r4d jnz .loopW lea r0, [r0 + 4 * r1 - 2 * %1] lea r2, [r2 + 4 * r3 - 2 * %1] dec dword [rsp] jnz .loopH RET %endmacro FILTER_VER_LUMA_SS 4, 4 FILTER_VER_LUMA_SS 8, 8 FILTER_VER_LUMA_SS 8, 4 FILTER_VER_LUMA_SS 4, 8 FILTER_VER_LUMA_SS 16, 16 FILTER_VER_LUMA_SS 16, 8 FILTER_VER_LUMA_SS 8, 16 FILTER_VER_LUMA_SS 16, 12 FILTER_VER_LUMA_SS 12, 16 FILTER_VER_LUMA_SS 16, 4 FILTER_VER_LUMA_SS 4, 16 FILTER_VER_LUMA_SS 32, 32 FILTER_VER_LUMA_SS 32, 16 FILTER_VER_LUMA_SS 16, 32 FILTER_VER_LUMA_SS 32, 24 FILTER_VER_LUMA_SS 24, 32 FILTER_VER_LUMA_SS 32, 8 FILTER_VER_LUMA_SS 8, 32 FILTER_VER_LUMA_SS 64, 64 FILTER_VER_LUMA_SS 64, 32 FILTER_VER_LUMA_SS 32, 64 FILTER_VER_LUMA_SS 64, 48 FILTER_VER_LUMA_SS 48, 64 FILTER_VER_LUMA_SS 64, 16 FILTER_VER_LUMA_SS 16, 64 %macro FILTER_VER_LUMA_AVX2_4x4 1 INIT_YMM avx2 cglobal interp_8tap_vert_%1_4x4, 4, 6, 7 mov r4d, r4m add r1d, r1d shl r4d, 7 %ifdef PIC lea r5, [pw_LumaCoeffVer] add r5, r4 %else lea r5, [pw_LumaCoeffVer + r4] %endif lea r4, [r1 * 3] sub r0, r4 %ifidn %1,sp mova m6, [pd_526336] %else add r3d, r3d %endif movq xm0, [r0] movq xm1, [r0 + r1] punpcklwd xm0, xm1 movq xm2, [r0 + r1 * 2] punpcklwd xm1, xm2 vinserti128 m0, m0, xm1, 1 ; m0 = [2 1 1 0] pmaddwd m0, [r5] movq xm3, [r0 + r4] punpcklwd xm2, xm3 lea r0, [r0 + 4 * r1] movq xm4, [r0] punpcklwd xm3, xm4 vinserti128 m2, m2, xm3, 1 ; m2 = [4 3 3 2] pmaddwd m5, m2, [r5 + 1 * mmsize] pmaddwd m2, [r5] paddd m0, m5 movq xm3, [r0 + r1] punpcklwd xm4, xm3 movq xm1, [r0 + r1 * 2] punpcklwd xm3, xm1 vinserti128 m4, m4, xm3, 1 ; m4 = [6 5 5 4] pmaddwd m5, m4, [r5 + 2 * mmsize] pmaddwd m4, [r5 + 1 * mmsize] paddd m0, m5 paddd m2, m4 movq xm3, [r0 + r4] punpcklwd xm1, xm3 lea r0, [r0 + 4 * r1] movq xm4, [r0] punpcklwd xm3, xm4 vinserti128 m1, m1, xm3, 1 ; m1 = [8 7 7 6] pmaddwd m5, m1, [r5 + 3 * mmsize] pmaddwd m1, [r5 + 2 * mmsize] paddd m0, m5 paddd m2, m1 movq xm3, [r0 + r1] punpcklwd xm4, xm3 movq xm1, [r0 + 2 * r1] punpcklwd xm3, xm1 
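; 8-tap luma filters keep their four coefficient pairs at
; [r5 + k * mmsize], k = 0..3 (hence the 128-byte table stride produced
; by shl r4d, 7); each packed row pair is multiplied by up to four of
; these pairs as it slides through the 8-tap window.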
vinserti128 m4, m4, xm3, 1 ; m4 = [A 9 9 8] pmaddwd m4, [r5 + 3 * mmsize] paddd m2, m4 %ifidn %1,sp paddd m0, m6 paddd m2, m6 psrad m0, 12 psrad m2, 12 %else psrad m0, 6 psrad m2, 6 %endif packssdw m0, m2 vextracti128 xm2, m0, 1 lea r4, [r3 * 3] %ifidn %1,sp packuswb xm0, xm2 movd [r2], xm0 pextrd [r2 + r3], xm0, 2 pextrd [r2 + r3 * 2], xm0, 1 pextrd [r2 + r4], xm0, 3 %else movq [r2], xm0 movq [r2 + r3], xm2 movhps [r2 + r3 * 2], xm0 movhps [r2 + r4], xm2 %endif RET %endmacro FILTER_VER_LUMA_AVX2_4x4 sp FILTER_VER_LUMA_AVX2_4x4 ss %macro FILTER_VER_LUMA_AVX2_4x8 1 INIT_YMM avx2 cglobal interp_8tap_vert_%1_4x8, 4, 7, 8 mov r4d, r4m shl r4d, 7 add r1d, r1d %ifdef PIC lea r5, [pw_LumaCoeffVer] add r5, r4 %else lea r5, [pw_LumaCoeffVer + r4] %endif lea r4, [r1 * 3] sub r0, r4 %ifidn %1,sp mova m7, [pd_526336] %else add r3d, r3d %endif lea r6, [r3 * 3] movq xm0, [r0] movq xm1, [r0 + r1] punpcklwd xm0, xm1 movq xm2, [r0 + r1 * 2] punpcklwd xm1, xm2 vinserti128 m0, m0, xm1, 1 ; m0 = [2 1 1 0] pmaddwd m0, [r5] movq xm3, [r0 + r4] punpcklwd xm2, xm3 lea r0, [r0 + 4 * r1] movq xm4, [r0] punpcklwd xm3, xm4 vinserti128 m2, m2, xm3, 1 ; m2 = [4 3 3 2] pmaddwd m5, m2, [r5 + 1 * mmsize] pmaddwd m2, [r5] paddd m0, m5 movq xm3, [r0 + r1] punpcklwd xm4, xm3 movq xm1, [r0 + r1 * 2] punpcklwd xm3, xm1 vinserti128 m4, m4, xm3, 1 ; m4 = [6 5 5 4] pmaddwd m5, m4, [r5 + 2 * mmsize] paddd m0, m5 pmaddwd m5, m4, [r5 + 1 * mmsize] paddd m2, m5 pmaddwd m4, [r5] movq xm3, [r0 + r4] punpcklwd xm1, xm3 lea r0, [r0 + 4 * r1] movq xm6, [r0] punpcklwd xm3, xm6 vinserti128 m1, m1, xm3, 1 ; m1 = [8 7 7 6] pmaddwd m5, m1, [r5 + 3 * mmsize] paddd m0, m5 pmaddwd m5, m1, [r5 + 2 * mmsize] paddd m2, m5 pmaddwd m5, m1, [r5 + 1 * mmsize] paddd m4, m5 pmaddwd m1, [r5] movq xm3, [r0 + r1] punpcklwd xm6, xm3 movq xm5, [r0 + 2 * r1] punpcklwd xm3, xm5 vinserti128 m6, m6, xm3, 1 ; m6 = [A 9 9 8] pmaddwd m3, m6, [r5 + 3 * mmsize] paddd m2, m3 pmaddwd m3, m6, [r5 + 2 * mmsize] paddd m4, m3 pmaddwd m6, [r5 + 1 * mmsize] paddd m1, m6 %ifidn %1,sp paddd m0, m7 paddd m2, m7 psrad m0, 12 psrad m2, 12 %else psrad m0, 6 psrad m2, 6 %endif packssdw m0, m2 movq xm3, [r0 + r4] punpcklwd xm5, xm3 lea r0, [r0 + 4 * r1] movq xm2, [r0] punpcklwd xm3, xm2 vinserti128 m5, m5, xm3, 1 ; m5 = [C B B A] pmaddwd m3, m5, [r5 + 3 * mmsize] paddd m4, m3 pmaddwd m5, [r5 + 2 * mmsize] paddd m1, m5 movq xm3, [r0 + r1] punpcklwd xm2, xm3 movq xm5, [r0 + 2 * r1] punpcklwd xm3, xm5 vinserti128 m2, m2, xm3, 1 ; m2 = [E D D C] pmaddwd m2, [r5 + 3 * mmsize] paddd m1, m2 %ifidn %1,sp paddd m4, m7 paddd m1, m7 psrad m4, 12 psrad m1, 12 %else psrad m4, 6 psrad m1, 6 %endif packssdw m4, m1 %ifidn %1,sp packuswb m0, m4 vextracti128 xm2, m0, 1 movd [r2], xm0 movd [r2 + r3], xm2 pextrd [r2 + r3 * 2], xm0, 1 pextrd [r2 + r6], xm2, 1 lea r2, [r2 + r3 * 4] pextrd [r2], xm0, 2 pextrd [r2 + r3], xm2, 2 pextrd [r2 + r3 * 2], xm0, 3 pextrd [r2 + r6], xm2, 3 %else vextracti128 xm2, m0, 1 vextracti128 xm1, m4, 1 movq [r2], xm0 movq [r2 + r3], xm2 movhps [r2 + r3 * 2], xm0 movhps [r2 + r6], xm2 lea r2, [r2 + r3 * 4] movq [r2], xm4 movq [r2 + r3], xm1 movhps [r2 + r3 * 2], xm4 movhps [r2 + r6], xm1 %endif RET %endmacro FILTER_VER_LUMA_AVX2_4x8 sp FILTER_VER_LUMA_AVX2_4x8 ss %macro PROCESS_LUMA_AVX2_W4_16R 1 movq xm0, [r0] movq xm1, [r0 + r1] punpcklwd xm0, xm1 movq xm2, [r0 + r1 * 2] punpcklwd xm1, xm2 vinserti128 m0, m0, xm1, 1 ; m0 = [2 1 1 0] pmaddwd m0, [r5] movq xm3, [r0 + r4] punpcklwd xm2, xm3 lea r0, [r0 + 4 * r1] movq xm4, [r0] punpcklwd xm3, xm4 vinserti128 m2, m2, xm3, 1 ; m2 = 
[4 3 3 2] pmaddwd m5, m2, [r5 + 1 * mmsize] pmaddwd m2, [r5] paddd m0, m5 movq xm3, [r0 + r1] punpcklwd xm4, xm3 movq xm1, [r0 + r1 * 2] punpcklwd xm3, xm1 vinserti128 m4, m4, xm3, 1 ; m4 = [6 5 5 4] pmaddwd m5, m4, [r5 + 2 * mmsize] paddd m0, m5 pmaddwd m5, m4, [r5 + 1 * mmsize] paddd m2, m5 pmaddwd m4, [r5] movq xm3, [r0 + r4] punpcklwd xm1, xm3 lea r0, [r0 + 4 * r1] movq xm6, [r0] punpcklwd xm3, xm6 vinserti128 m1, m1, xm3, 1 ; m1 = [8 7 7 6] pmaddwd m5, m1, [r5 + 3 * mmsize] paddd m0, m5 pmaddwd m5, m1, [r5 + 2 * mmsize] paddd m2, m5 pmaddwd m5, m1, [r5 + 1 * mmsize] paddd m4, m5 pmaddwd m1, [r5] movq xm3, [r0 + r1] punpcklwd xm6, xm3 movq xm5, [r0 + 2 * r1] punpcklwd xm3, xm5 vinserti128 m6, m6, xm3, 1 ; m6 = [10 9 9 8] pmaddwd m3, m6, [r5 + 3 * mmsize] paddd m2, m3 pmaddwd m3, m6, [r5 + 2 * mmsize] paddd m4, m3 pmaddwd m3, m6, [r5 + 1 * mmsize] paddd m1, m3 pmaddwd m6, [r5] %ifidn %1,sp paddd m0, m7 paddd m2, m7 psrad m0, 12 psrad m2, 12 %else psrad m0, 6 psrad m2, 6 %endif packssdw m0, m2 vextracti128 xm2, m0, 1 %ifidn %1,sp packuswb xm0, xm2 movd [r2], xm0 pextrd [r2 + r3], xm0, 2 pextrd [r2 + r3 * 2], xm0, 1 pextrd [r2 + r6], xm0, 3 %else movq [r2], xm0 movq [r2 + r3], xm2 movhps [r2 + r3 * 2], xm0 movhps [r2 + r6], xm2 %endif movq xm2, [r0 + r4] punpcklwd xm5, xm2 lea r0, [r0 + 4 * r1] movq xm0, [r0] punpcklwd xm2, xm0 vinserti128 m5, m5, xm2, 1 ; m5 = [12 11 11 10] pmaddwd m2, m5, [r5 + 3 * mmsize] paddd m4, m2 pmaddwd m2, m5, [r5 + 2 * mmsize] paddd m1, m2 pmaddwd m2, m5, [r5 + 1 * mmsize] paddd m6, m2 pmaddwd m5, [r5] movq xm2, [r0 + r1] punpcklwd xm0, xm2 movq xm3, [r0 + 2 * r1] punpcklwd xm2, xm3 vinserti128 m0, m0, xm2, 1 ; m0 = [14 13 13 12] pmaddwd m2, m0, [r5 + 3 * mmsize] paddd m1, m2 pmaddwd m2, m0, [r5 + 2 * mmsize] paddd m6, m2 pmaddwd m2, m0, [r5 + 1 * mmsize] paddd m5, m2 pmaddwd m0, [r5] %ifidn %1,sp paddd m4, m7 paddd m1, m7 psrad m4, 12 psrad m1, 12 %else psrad m4, 6 psrad m1, 6 %endif packssdw m4, m1 vextracti128 xm1, m4, 1 lea r2, [r2 + r3 * 4] %ifidn %1,sp packuswb xm4, xm1 movd [r2], xm4 pextrd [r2 + r3], xm4, 2 pextrd [r2 + r3 * 2], xm4, 1 pextrd [r2 + r6], xm4, 3 %else movq [r2], xm4 movq [r2 + r3], xm1 movhps [r2 + r3 * 2], xm4 movhps [r2 + r6], xm1 %endif movq xm4, [r0 + r4] punpcklwd xm3, xm4 lea r0, [r0 + 4 * r1] movq xm1, [r0] punpcklwd xm4, xm1 vinserti128 m3, m3, xm4, 1 ; m3 = [16 15 15 14] pmaddwd m4, m3, [r5 + 3 * mmsize] paddd m6, m4 pmaddwd m4, m3, [r5 + 2 * mmsize] paddd m5, m4 pmaddwd m4, m3, [r5 + 1 * mmsize] paddd m0, m4 pmaddwd m3, [r5] movq xm4, [r0 + r1] punpcklwd xm1, xm4 movq xm2, [r0 + 2 * r1] punpcklwd xm4, xm2 vinserti128 m1, m1, xm4, 1 ; m1 = [18 17 17 16] pmaddwd m4, m1, [r5 + 3 * mmsize] paddd m5, m4 pmaddwd m4, m1, [r5 + 2 * mmsize] paddd m0, m4 pmaddwd m1, [r5 + 1 * mmsize] paddd m3, m1 movq xm4, [r0 + r4] punpcklwd xm2, xm4 lea r0, [r0 + 4 * r1] movq xm1, [r0] punpcklwd xm4, xm1 vinserti128 m2, m2, xm4, 1 ; m2 = [20 19 19 18] pmaddwd m4, m2, [r5 + 3 * mmsize] paddd m0, m4 pmaddwd m2, [r5 + 2 * mmsize] paddd m3, m2 movq xm4, [r0 + r1] punpcklwd xm1, xm4 movq xm2, [r0 + 2 * r1] punpcklwd xm4, xm2 vinserti128 m1, m1, xm4, 1 ; m1 = [22 21 21 20] pmaddwd m1, [r5 + 3 * mmsize] paddd m3, m1 %ifidn %1,sp paddd m6, m7 paddd m5, m7 paddd m0, m7 paddd m3, m7 psrad m6, 12 psrad m5, 12 psrad m0, 12 psrad m3, 12 %else psrad m6, 6 psrad m5, 6 psrad m0, 6 psrad m3, 6 %endif packssdw m6, m5 packssdw m0, m3 lea r2, [r2 + r3 * 4] %ifidn %1,sp packuswb m6, m0 vextracti128 xm0, m6, 1 movd [r2], xm6 movd [r2 + r3], xm0 pextrd [r2 + r3 * 2], xm6, 1 
pextrd [r2 + r6], xm0, 1 lea r2, [r2 + r3 * 4] pextrd [r2], xm6, 2 pextrd [r2 + r3], xm0, 2 pextrd [r2 + r3 * 2], xm6, 3 pextrd [r2 + r6], xm0, 3 %else vextracti128 xm5, m6, 1 vextracti128 xm3, m0, 1 movq [r2], xm6 movq [r2 + r3], xm5 movhps [r2 + r3 * 2], xm6 movhps [r2 + r6], xm5 lea r2, [r2 + r3 * 4] movq [r2], xm0 movq [r2 + r3], xm3 movhps [r2 + r3 * 2], xm0 movhps [r2 + r6], xm3 %endif %endmacro %macro FILTER_VER_LUMA_AVX2_4x16 1 INIT_YMM avx2 cglobal interp_8tap_vert_%1_4x16, 4, 7, 8 mov r4d, r4m shl r4d, 7 add r1d, r1d %ifdef PIC lea r5, [pw_LumaCoeffVer] add r5, r4 %else lea r5, [pw_LumaCoeffVer + r4] %endif lea r4, [r1 * 3] sub r0, r4 %ifidn %1,sp mova m7, [pd_526336] %else add r3d, r3d %endif lea r6, [r3 * 3] PROCESS_LUMA_AVX2_W4_16R %1 RET %endmacro FILTER_VER_LUMA_AVX2_4x16 sp FILTER_VER_LUMA_AVX2_4x16 ss %macro FILTER_VER_LUMA_S_AVX2_8x8 1 INIT_YMM avx2 %if ARCH_X86_64 == 1 cglobal interp_8tap_vert_%1_8x8, 4, 6, 12 mov r4d, r4m shl r4d, 7 add r1d, r1d %ifdef PIC lea r5, [pw_LumaCoeffVer] add r5, r4 %else lea r5, [pw_LumaCoeffVer + r4] %endif lea r4, [r1 * 3] sub r0, r4 %ifidn %1,sp mova m11, [pd_526336] %else add r3d, r3d %endif movu xm0, [r0] ; m0 = row 0 movu xm1, [r0 + r1] ; m1 = row 1 punpckhwd xm2, xm0, xm1 punpcklwd xm0, xm1 vinserti128 m0, m0, xm2, 1 pmaddwd m0, [r5] movu xm2, [r0 + r1 * 2] ; m2 = row 2 punpckhwd xm3, xm1, xm2 punpcklwd xm1, xm2 vinserti128 m1, m1, xm3, 1 pmaddwd m1, [r5] movu xm3, [r0 + r4] ; m3 = row 3 punpckhwd xm4, xm2, xm3 punpcklwd xm2, xm3 vinserti128 m2, m2, xm4, 1 pmaddwd m4, m2, [r5 + 1 * mmsize] pmaddwd m2, [r5] paddd m0, m4 lea r0, [r0 + r1 * 4] movu xm4, [r0] ; m4 = row 4 punpckhwd xm5, xm3, xm4 punpcklwd xm3, xm4 vinserti128 m3, m3, xm5, 1 pmaddwd m5, m3, [r5 + 1 * mmsize] pmaddwd m3, [r5] paddd m1, m5 movu xm5, [r0 + r1] ; m5 = row 5 punpckhwd xm6, xm4, xm5 punpcklwd xm4, xm5 vinserti128 m4, m4, xm6, 1 pmaddwd m6, m4, [r5 + 2 * mmsize] paddd m0, m6 pmaddwd m6, m4, [r5 + 1 * mmsize] paddd m2, m6 pmaddwd m4, [r5] movu xm6, [r0 + r1 * 2] ; m6 = row 6 punpckhwd xm7, xm5, xm6 punpcklwd xm5, xm6 vinserti128 m5, m5, xm7, 1 pmaddwd m7, m5, [r5 + 2 * mmsize] paddd m1, m7 pmaddwd m7, m5, [r5 + 1 * mmsize] pmaddwd m5, [r5] paddd m3, m7 movu xm7, [r0 + r4] ; m7 = row 7 punpckhwd xm8, xm6, xm7 punpcklwd xm6, xm7 vinserti128 m6, m6, xm8, 1 pmaddwd m8, m6, [r5 + 3 * mmsize] paddd m0, m8 pmaddwd m8, m6, [r5 + 2 * mmsize] paddd m2, m8 pmaddwd m8, m6, [r5 + 1 * mmsize] pmaddwd m6, [r5] paddd m4, m8 lea r0, [r0 + r1 * 4] movu xm8, [r0] ; m8 = row 8 punpckhwd xm9, xm7, xm8 punpcklwd xm7, xm8 vinserti128 m7, m7, xm9, 1 pmaddwd m9, m7, [r5 + 3 * mmsize] paddd m1, m9 pmaddwd m9, m7, [r5 + 2 * mmsize] paddd m3, m9 pmaddwd m9, m7, [r5 + 1 * mmsize] pmaddwd m7, [r5] paddd m5, m9 movu xm9, [r0 + r1] ; m9 = row 9 punpckhwd xm10, xm8, xm9 punpcklwd xm8, xm9 vinserti128 m8, m8, xm10, 1 pmaddwd m10, m8, [r5 + 3 * mmsize] paddd m2, m10 pmaddwd m10, m8, [r5 + 2 * mmsize] pmaddwd m8, [r5 + 1 * mmsize] paddd m4, m10 paddd m6, m8 movu xm10, [r0 + r1 * 2] ; m10 = row 10 punpckhwd xm8, xm9, xm10 punpcklwd xm9, xm10 vinserti128 m9, m9, xm8, 1 pmaddwd m8, m9, [r5 + 3 * mmsize] paddd m3, m8 pmaddwd m8, m9, [r5 + 2 * mmsize] pmaddwd m9, [r5 + 1 * mmsize] paddd m5, m8 paddd m7, m9 movu xm8, [r0 + r4] ; m8 = row 11 punpckhwd xm9, xm10, xm8 punpcklwd xm10, xm8 vinserti128 m10, m10, xm9, 1 pmaddwd m9, m10, [r5 + 3 * mmsize] pmaddwd m10, [r5 + 2 * mmsize] paddd m4, m9 paddd m6, m10 lea r4, [r3 * 3] %ifidn %1,sp paddd m0, m11 paddd m1, m11 paddd m2, m11 paddd m3, m11 psrad m0, 12 
psrad m1, 12 psrad m2, 12 psrad m3, 12 %else psrad m0, 6 psrad m1, 6 psrad m2, 6 psrad m3, 6 %endif packssdw m0, m1 packssdw m2, m3 %ifidn %1,sp packuswb m0, m2 mova m1, [interp8_hps_shuf] vpermd m0, m1, m0 vextracti128 xm2, m0, 1 movq [r2], xm0 movhps [r2 + r3], xm0 movq [r2 + r3 * 2], xm2 movhps [r2 + r4], xm2 %else vpermq m0, m0, 11011000b vpermq m2, m2, 11011000b vextracti128 xm1, m0, 1 vextracti128 xm3, m2, 1 movu [r2], xm0 movu [r2 + r3], xm1 movu [r2 + r3 * 2], xm2 movu [r2 + r4], xm3 %endif lea r0, [r0 + r1 * 4] movu xm9, [r0] ; m9 = row 12 punpckhwd xm3, xm8, xm9 punpcklwd xm8, xm9 vinserti128 m8, m8, xm3, 1 pmaddwd m3, m8, [r5 + 3 * mmsize] pmaddwd m8, [r5 + 2 * mmsize] paddd m5, m3 paddd m7, m8 movu xm3, [r0 + r1] ; m3 = row 13 punpckhwd xm0, xm9, xm3 punpcklwd xm9, xm3 vinserti128 m9, m9, xm0, 1 pmaddwd m9, [r5 + 3 * mmsize] paddd m6, m9 movu xm0, [r0 + r1 * 2] ; m0 = row 14 punpckhwd xm9, xm3, xm0 punpcklwd xm3, xm0 vinserti128 m3, m3, xm9, 1 pmaddwd m3, [r5 + 3 * mmsize] paddd m7, m3 %ifidn %1,sp paddd m4, m11 paddd m5, m11 paddd m6, m11 paddd m7, m11 psrad m4, 12 psrad m5, 12 psrad m6, 12 psrad m7, 12 %else psrad m4, 6 psrad m5, 6 psrad m6, 6 psrad m7, 6 %endif packssdw m4, m5 packssdw m6, m7 lea r2, [r2 + r3 * 4] %ifidn %1,sp packuswb m4, m6 vpermd m4, m1, m4 vextracti128 xm6, m4, 1 movq [r2], xm4 movhps [r2 + r3], xm4 movq [r2 + r3 * 2], xm6 movhps [r2 + r4], xm6 %else vpermq m4, m4, 11011000b vpermq m6, m6, 11011000b vextracti128 xm5, m4, 1 vextracti128 xm7, m6, 1 movu [r2], xm4 movu [r2 + r3], xm5 movu [r2 + r3 * 2], xm6 movu [r2 + r4], xm7 %endif RET %endif %endmacro FILTER_VER_LUMA_S_AVX2_8x8 sp FILTER_VER_LUMA_S_AVX2_8x8 ss %macro FILTER_VER_LUMA_S_AVX2_8xN 2 INIT_YMM avx2 %if ARCH_X86_64 == 1 cglobal interp_8tap_vert_%1_8x%2, 4, 9, 15 mov r4d, r4m shl r4d, 7 add r1d, r1d %ifdef PIC lea r5, [pw_LumaCoeffVer] add r5, r4 %else lea r5, [pw_LumaCoeffVer + r4] %endif lea r4, [r1 * 3] sub r0, r4 %ifidn %1,sp mova m14, [pd_526336] %else add r3d, r3d %endif lea r6, [r3 * 3] lea r7, [r1 * 4] mov r8d, %2 / 16 .loopH: movu xm0, [r0] ; m0 = row 0 movu xm1, [r0 + r1] ; m1 = row 1 punpckhwd xm2, xm0, xm1 punpcklwd xm0, xm1 vinserti128 m0, m0, xm2, 1 pmaddwd m0, [r5] movu xm2, [r0 + r1 * 2] ; m2 = row 2 punpckhwd xm3, xm1, xm2 punpcklwd xm1, xm2 vinserti128 m1, m1, xm3, 1 pmaddwd m1, [r5] movu xm3, [r0 + r4] ; m3 = row 3 punpckhwd xm4, xm2, xm3 punpcklwd xm2, xm3 vinserti128 m2, m2, xm4, 1 pmaddwd m4, m2, [r5 + 1 * mmsize] paddd m0, m4 pmaddwd m2, [r5] lea r0, [r0 + r1 * 4] movu xm4, [r0] ; m4 = row 4 punpckhwd xm5, xm3, xm4 punpcklwd xm3, xm4 vinserti128 m3, m3, xm5, 1 pmaddwd m5, m3, [r5 + 1 * mmsize] paddd m1, m5 pmaddwd m3, [r5] movu xm5, [r0 + r1] ; m5 = row 5 punpckhwd xm6, xm4, xm5 punpcklwd xm4, xm5 vinserti128 m4, m4, xm6, 1 pmaddwd m6, m4, [r5 + 2 * mmsize] paddd m0, m6 pmaddwd m6, m4, [r5 + 1 * mmsize] paddd m2, m6 pmaddwd m4, [r5] movu xm6, [r0 + r1 * 2] ; m6 = row 6 punpckhwd xm7, xm5, xm6 punpcklwd xm5, xm6 vinserti128 m5, m5, xm7, 1 pmaddwd m7, m5, [r5 + 2 * mmsize] paddd m1, m7 pmaddwd m7, m5, [r5 + 1 * mmsize] paddd m3, m7 pmaddwd m5, [r5] movu xm7, [r0 + r4] ; m7 = row 7 punpckhwd xm8, xm6, xm7 punpcklwd xm6, xm7 vinserti128 m6, m6, xm8, 1 pmaddwd m8, m6, [r5 + 3 * mmsize] paddd m0, m8 pmaddwd m8, m6, [r5 + 2 * mmsize] paddd m2, m8 pmaddwd m8, m6, [r5 + 1 * mmsize] paddd m4, m8 pmaddwd m6, [r5] lea r0, [r0 + r1 * 4] movu xm8, [r0] ; m8 = row 8 punpckhwd xm9, xm7, xm8 punpcklwd xm7, xm8 vinserti128 m7, m7, xm9, 1 pmaddwd m9, m7, [r5 + 3 * mmsize] paddd m1, m9 
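; Software-pipelined: each freshly loaded row is first accumulated into
; the older output rows still in flight (via the [r5 + 3..1 * mmsize]
; coefficient pairs) before pmaddwd with [r5] opens its own output row,
; keeping all 15 allocated ymm registers busy across the 16-row loop.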
pmaddwd m9, m7, [r5 + 2 * mmsize] paddd m3, m9 pmaddwd m9, m7, [r5 + 1 * mmsize] paddd m5, m9 pmaddwd m7, [r5] movu xm9, [r0 + r1] ; m9 = row 9 punpckhwd xm10, xm8, xm9 punpcklwd xm8, xm9 vinserti128 m8, m8, xm10, 1 pmaddwd m10, m8, [r5 + 3 * mmsize] paddd m2, m10 pmaddwd m10, m8, [r5 + 2 * mmsize] paddd m4, m10 pmaddwd m10, m8, [r5 + 1 * mmsize] paddd m6, m10 pmaddwd m8, [r5] movu xm10, [r0 + r1 * 2] ; m10 = row 10 punpckhwd xm11, xm9, xm10 punpcklwd xm9, xm10 vinserti128 m9, m9, xm11, 1 pmaddwd m11, m9, [r5 + 3 * mmsize] paddd m3, m11 pmaddwd m11, m9, [r5 + 2 * mmsize] paddd m5, m11 pmaddwd m11, m9, [r5 + 1 * mmsize] paddd m7, m11 pmaddwd m9, [r5] movu xm11, [r0 + r4] ; m11 = row 11 punpckhwd xm12, xm10, xm11 punpcklwd xm10, xm11 vinserti128 m10, m10, xm12, 1 pmaddwd m12, m10, [r5 + 3 * mmsize] paddd m4, m12 pmaddwd m12, m10, [r5 + 2 * mmsize] paddd m6, m12 pmaddwd m12, m10, [r5 + 1 * mmsize] paddd m8, m12 pmaddwd m10, [r5] lea r0, [r0 + r1 * 4] movu xm12, [r0] ; m12 = row 12 punpckhwd xm13, xm11, xm12 punpcklwd xm11, xm12 vinserti128 m11, m11, xm13, 1 pmaddwd m13, m11, [r5 + 3 * mmsize] paddd m5, m13 pmaddwd m13, m11, [r5 + 2 * mmsize] paddd m7, m13 pmaddwd m13, m11, [r5 + 1 * mmsize] paddd m9, m13 pmaddwd m11, [r5] %ifidn %1,sp paddd m0, m14 paddd m1, m14 paddd m2, m14 paddd m3, m14 paddd m4, m14 paddd m5, m14 psrad m0, 12 psrad m1, 12 psrad m2, 12 psrad m3, 12 psrad m4, 12 psrad m5, 12 %else psrad m0, 6 psrad m1, 6 psrad m2, 6 psrad m3, 6 psrad m4, 6 psrad m5, 6 %endif packssdw m0, m1 packssdw m2, m3 packssdw m4, m5 %ifidn %1,sp packuswb m0, m2 mova m1, [interp8_hps_shuf] vpermd m0, m1, m0 vextracti128 xm2, m0, 1 movq [r2], xm0 movhps [r2 + r3], xm0 movq [r2 + r3 * 2], xm2 movhps [r2 + r6], xm2 %else vpermq m0, m0, 11011000b vpermq m2, m2, 11011000b vextracti128 xm1, m0, 1 vextracti128 xm3, m2, 1 movu [r2], xm0 movu [r2 + r3], xm1 movu [r2 + r3 * 2], xm2 movu [r2 + r6], xm3 %endif movu xm13, [r0 + r1] ; m13 = row 13 punpckhwd xm0, xm12, xm13 punpcklwd xm12, xm13 vinserti128 m12, m12, xm0, 1 pmaddwd m0, m12, [r5 + 3 * mmsize] paddd m6, m0 pmaddwd m0, m12, [r5 + 2 * mmsize] paddd m8, m0 pmaddwd m0, m12, [r5 + 1 * mmsize] paddd m10, m0 pmaddwd m12, [r5] movu xm0, [r0 + r1 * 2] ; m0 = row 14 punpckhwd xm2, xm13, xm0 punpcklwd xm13, xm0 vinserti128 m13, m13, xm2, 1 pmaddwd m2, m13, [r5 + 3 * mmsize] paddd m7, m2 pmaddwd m2, m13, [r5 + 2 * mmsize] paddd m9, m2 pmaddwd m2, m13, [r5 + 1 * mmsize] paddd m11, m2 pmaddwd m13, [r5] %ifidn %1,sp paddd m6, m14 paddd m7, m14 psrad m6, 12 psrad m7, 12 %else psrad m6, 6 psrad m7, 6 %endif packssdw m6, m7 lea r2, [r2 + r3 * 4] %ifidn %1,sp packuswb m4, m6 vpermd m4, m1, m4 vextracti128 xm6, m4, 1 movq [r2], xm4 movhps [r2 + r3], xm4 movq [r2 + r3 * 2], xm6 movhps [r2 + r6], xm6 %else vpermq m6, m6, 11011000b vpermq m4, m4, 11011000b vextracti128 xm1, m4, 1 vextracti128 xm7, m6, 1 movu [r2], xm4 movu [r2 + r3], xm1 movu [r2 + r3 * 2], xm6 movu [r2 + r6], xm7 %endif movu xm6, [r0 + r4] ; m6 = row 15 punpckhwd xm5, xm0, xm6 punpcklwd xm0, xm6 vinserti128 m0, m0, xm5, 1 pmaddwd m5, m0, [r5 + 3 * mmsize] paddd m8, m5 pmaddwd m5, m0, [r5 + 2 * mmsize] paddd m10, m5 pmaddwd m5, m0, [r5 + 1 * mmsize] paddd m12, m5 pmaddwd m0, [r5] lea r0, [r0 + r1 * 4] movu xm2, [r0] ; m2 = row 16 punpckhwd xm3, xm6, xm2 punpcklwd xm6, xm2 vinserti128 m6, m6, xm3, 1 pmaddwd m3, m6, [r5 + 3 * mmsize] paddd m9, m3 pmaddwd m3, m6, [r5 + 2 * mmsize] paddd m11, m3 pmaddwd m3, m6, [r5 + 1 * mmsize] paddd m13, m3 pmaddwd m6, [r5] movu xm3, [r0 + r1] ; m3 = row 17 punpckhwd xm4, xm2, 
xm3 punpcklwd xm2, xm3 vinserti128 m2, m2, xm4, 1 pmaddwd m4, m2, [r5 + 3 * mmsize] paddd m10, m4 pmaddwd m4, m2, [r5 + 2 * mmsize] paddd m12, m4 pmaddwd m2, [r5 + 1 * mmsize] paddd m0, m2 movu xm4, [r0 + r1 * 2] ; m4 = row 18 punpckhwd xm2, xm3, xm4 punpcklwd xm3, xm4 vinserti128 m3, m3, xm2, 1 pmaddwd m2, m3, [r5 + 3 * mmsize] paddd m11, m2 pmaddwd m2, m3, [r5 + 2 * mmsize] paddd m13, m2 pmaddwd m3, [r5 + 1 * mmsize] paddd m6, m3 movu xm2, [r0 + r4] ; m2 = row 19 punpckhwd xm7, xm4, xm2 punpcklwd xm4, xm2 vinserti128 m4, m4, xm7, 1 pmaddwd m7, m4, [r5 + 3 * mmsize] paddd m12, m7 pmaddwd m4, [r5 + 2 * mmsize] paddd m0, m4 lea r0, [r0 + r1 * 4] movu xm7, [r0] ; m7 = row 20 punpckhwd xm3, xm2, xm7 punpcklwd xm2, xm7 vinserti128 m2, m2, xm3, 1 pmaddwd m3, m2, [r5 + 3 * mmsize] paddd m13, m3 pmaddwd m2, [r5 + 2 * mmsize] paddd m6, m2 movu xm3, [r0 + r1] ; m3 = row 21 punpckhwd xm2, xm7, xm3 punpcklwd xm7, xm3 vinserti128 m7, m7, xm2, 1 pmaddwd m7, [r5 + 3 * mmsize] paddd m0, m7 movu xm2, [r0 + r1 * 2] ; m2 = row 22 punpckhwd xm7, xm3, xm2 punpcklwd xm3, xm2 vinserti128 m3, m3, xm7, 1 pmaddwd m3, [r5 + 3 * mmsize] paddd m6, m3 %ifidn %1,sp paddd m8, m14 paddd m9, m14 paddd m10, m14 paddd m11, m14 paddd m12, m14 paddd m13, m14 paddd m0, m14 paddd m6, m14 psrad m8, 12 psrad m9, 12 psrad m10, 12 psrad m11, 12 psrad m12, 12 psrad m13, 12 psrad m0, 12 psrad m6, 12 %else psrad m8, 6 psrad m9, 6 psrad m10, 6 psrad m11, 6 psrad m12, 6 psrad m13, 6 psrad m0, 6 psrad m6, 6 %endif packssdw m8, m9 packssdw m10, m11 packssdw m12, m13 packssdw m0, m6 lea r2, [r2 + r3 * 4] %ifidn %1,sp packuswb m8, m10 packuswb m12, m0 vpermd m8, m1, m8 vpermd m12, m1, m12 vextracti128 xm10, m8, 1 vextracti128 xm0, m12, 1 movq [r2], xm8 movhps [r2 + r3], xm8 movq [r2 + r3 * 2], xm10 movhps [r2 + r6], xm10 lea r2, [r2 + r3 * 4] movq [r2], xm12 movhps [r2 + r3], xm12 movq [r2 + r3 * 2], xm0 movhps [r2 + r6], xm0 %else vpermq m8, m8, 11011000b vpermq m10, m10, 11011000b vpermq m12, m12, 11011000b vpermq m0, m0, 11011000b vextracti128 xm9, m8, 1 vextracti128 xm11, m10, 1 vextracti128 xm13, m12, 1 vextracti128 xm6, m0, 1 movu [r2], xm8 movu [r2 + r3], xm9 movu [r2 + r3 * 2], xm10 movu [r2 + r6], xm11 lea r2, [r2 + r3 * 4] movu [r2], xm12 movu [r2 + r3], xm13 movu [r2 + r3 * 2], xm0 movu [r2 + r6], xm6 %endif lea r2, [r2 + r3 * 4] sub r0, r7 dec r8d jnz .loopH RET %endif %endmacro FILTER_VER_LUMA_S_AVX2_8xN sp, 16 FILTER_VER_LUMA_S_AVX2_8xN sp, 32 FILTER_VER_LUMA_S_AVX2_8xN ss, 16 FILTER_VER_LUMA_S_AVX2_8xN ss, 32 %macro PROCESS_LUMA_S_AVX2_W8_4R 1 movu xm0, [r0] ; m0 = row 0 movu xm1, [r0 + r1] ; m1 = row 1 punpckhwd xm2, xm0, xm1 punpcklwd xm0, xm1 vinserti128 m0, m0, xm2, 1 pmaddwd m0, [r5] movu xm2, [r0 + r1 * 2] ; m2 = row 2 punpckhwd xm3, xm1, xm2 punpcklwd xm1, xm2 vinserti128 m1, m1, xm3, 1 pmaddwd m1, [r5] movu xm3, [r0 + r4] ; m3 = row 3 punpckhwd xm4, xm2, xm3 punpcklwd xm2, xm3 vinserti128 m2, m2, xm4, 1 pmaddwd m4, m2, [r5 + 1 * mmsize] paddd m0, m4 pmaddwd m2, [r5] lea r0, [r0 + r1 * 4] movu xm4, [r0] ; m4 = row 4 punpckhwd xm5, xm3, xm4 punpcklwd xm3, xm4 vinserti128 m3, m3, xm5, 1 pmaddwd m5, m3, [r5 + 1 * mmsize] paddd m1, m5 pmaddwd m3, [r5] movu xm5, [r0 + r1] ; m5 = row 5 punpckhwd xm6, xm4, xm5 punpcklwd xm4, xm5 vinserti128 m4, m4, xm6, 1 pmaddwd m6, m4, [r5 + 2 * mmsize] paddd m0, m6 pmaddwd m4, [r5 + 1 * mmsize] paddd m2, m4 movu xm6, [r0 + r1 * 2] ; m6 = row 6 punpckhwd xm4, xm5, xm6 punpcklwd xm5, xm6 vinserti128 m5, m5, xm4, 1 pmaddwd m4, m5, [r5 + 2 * mmsize] paddd m1, m4 pmaddwd m5, [r5 + 1 * mmsize] 
paddd m3, m5 movu xm4, [r0 + r4] ; m4 = row 7 punpckhwd xm5, xm6, xm4 punpcklwd xm6, xm4 vinserti128 m6, m6, xm5, 1 pmaddwd m5, m6, [r5 + 3 * mmsize] paddd m0, m5 pmaddwd m6, [r5 + 2 * mmsize] paddd m2, m6 lea r0, [r0 + r1 * 4] movu xm5, [r0] ; m5 = row 8 punpckhwd xm6, xm4, xm5 punpcklwd xm4, xm5 vinserti128 m4, m4, xm6, 1 pmaddwd m6, m4, [r5 + 3 * mmsize] paddd m1, m6 pmaddwd m4, [r5 + 2 * mmsize] paddd m3, m4 movu xm6, [r0 + r1] ; m6 = row 9 punpckhwd xm4, xm5, xm6 punpcklwd xm5, xm6 vinserti128 m5, m5, xm4, 1 pmaddwd m5, [r5 + 3 * mmsize] paddd m2, m5 movu xm4, [r0 + r1 * 2] ; m4 = row 10 punpckhwd xm5, xm6, xm4 punpcklwd xm6, xm4 vinserti128 m6, m6, xm5, 1 pmaddwd m6, [r5 + 3 * mmsize] paddd m3, m6 %ifidn %1,sp paddd m0, m7 paddd m1, m7 paddd m2, m7 paddd m3, m7 psrad m0, 12 psrad m1, 12 psrad m2, 12 psrad m3, 12 %else psrad m0, 6 psrad m1, 6 psrad m2, 6 psrad m3, 6 %endif packssdw m0, m1 packssdw m2, m3 %ifidn %1,sp packuswb m0, m2 mova m4, [interp8_hps_shuf] vpermd m0, m4, m0 vextracti128 xm2, m0, 1 %else vpermq m0, m0, 11011000b vpermq m2, m2, 11011000b vextracti128 xm1, m0, 1 vextracti128 xm3, m2, 1 %endif %endmacro %macro FILTER_VER_LUMA_S_AVX2_8x4 1 INIT_YMM avx2 cglobal interp_8tap_vert_%1_8x4, 4, 6, 8 mov r4d, r4m shl r4d, 7 add r1d, r1d %ifdef PIC lea r5, [pw_LumaCoeffVer] add r5, r4 %else lea r5, [pw_LumaCoeffVer + r4] %endif lea r4, [r1 * 3] sub r0, r4 %ifidn %1,sp mova m7, [pd_526336] %else add r3d, r3d %endif PROCESS_LUMA_S_AVX2_W8_4R %1 lea r4, [r3 * 3] %ifidn %1,sp movq [r2], xm0 movhps [r2 + r3], xm0 movq [r2 + r3 * 2], xm2 movhps [r2 + r4], xm2 %else movu [r2], xm0 movu [r2 + r3], xm1 movu [r2 + r3 * 2], xm2 movu [r2 + r4], xm3 %endif RET %endmacro FILTER_VER_LUMA_S_AVX2_8x4 sp FILTER_VER_LUMA_S_AVX2_8x4 ss %macro PROCESS_LUMA_AVX2_W8_16R 1 movu xm0, [r0] ; m0 = row 0 movu xm1, [r0 + r1] ; m1 = row 1 punpckhwd xm2, xm0, xm1 punpcklwd xm0, xm1 vinserti128 m0, m0, xm2, 1 pmaddwd m0, [r5] movu xm2, [r0 + r1 * 2] ; m2 = row 2 punpckhwd xm3, xm1, xm2 punpcklwd xm1, xm2 vinserti128 m1, m1, xm3, 1 pmaddwd m1, [r5] movu xm3, [r0 + r4] ; m3 = row 3 punpckhwd xm4, xm2, xm3 punpcklwd xm2, xm3 vinserti128 m2, m2, xm4, 1 pmaddwd m4, m2, [r5 + 1 * mmsize] paddd m0, m4 pmaddwd m2, [r5] lea r7, [r0 + r1 * 4] movu xm4, [r7] ; m4 = row 4 punpckhwd xm5, xm3, xm4 punpcklwd xm3, xm4 vinserti128 m3, m3, xm5, 1 pmaddwd m5, m3, [r5 + 1 * mmsize] paddd m1, m5 pmaddwd m3, [r5] movu xm5, [r7 + r1] ; m5 = row 5 punpckhwd xm6, xm4, xm5 punpcklwd xm4, xm5 vinserti128 m4, m4, xm6, 1 pmaddwd m6, m4, [r5 + 2 * mmsize] paddd m0, m6 pmaddwd m6, m4, [r5 + 1 * mmsize] paddd m2, m6 pmaddwd m4, [r5] movu xm6, [r7 + r1 * 2] ; m6 = row 6 punpckhwd xm7, xm5, xm6 punpcklwd xm5, xm6 vinserti128 m5, m5, xm7, 1 pmaddwd m7, m5, [r5 + 2 * mmsize] paddd m1, m7 pmaddwd m7, m5, [r5 + 1 * mmsize] paddd m3, m7 pmaddwd m5, [r5] movu xm7, [r7 + r4] ; m7 = row 7 punpckhwd xm8, xm6, xm7 punpcklwd xm6, xm7 vinserti128 m6, m6, xm8, 1 pmaddwd m8, m6, [r5 + 3 * mmsize] paddd m0, m8 pmaddwd m8, m6, [r5 + 2 * mmsize] paddd m2, m8 pmaddwd m8, m6, [r5 + 1 * mmsize] paddd m4, m8 pmaddwd m6, [r5] lea r7, [r7 + r1 * 4] movu xm8, [r7] ; m8 = row 8 punpckhwd xm9, xm7, xm8 punpcklwd xm7, xm8 vinserti128 m7, m7, xm9, 1 pmaddwd m9, m7, [r5 + 3 * mmsize] paddd m1, m9 pmaddwd m9, m7, [r5 + 2 * mmsize] paddd m3, m9 pmaddwd m9, m7, [r5 + 1 * mmsize] paddd m5, m9 pmaddwd m7, [r5] movu xm9, [r7 + r1] ; m9 = row 9 punpckhwd xm10, xm8, xm9 punpcklwd xm8, xm9 vinserti128 m8, m8, xm10, 1 pmaddwd m10, m8, [r5 + 3 * mmsize] paddd m2, m10 pmaddwd m10, 
m8, [r5 + 2 * mmsize] paddd m4, m10 pmaddwd m10, m8, [r5 + 1 * mmsize] paddd m6, m10 pmaddwd m8, [r5] movu xm10, [r7 + r1 * 2] ; m10 = row 10 punpckhwd xm11, xm9, xm10 punpcklwd xm9, xm10 vinserti128 m9, m9, xm11, 1 pmaddwd m11, m9, [r5 + 3 * mmsize] paddd m3, m11 pmaddwd m11, m9, [r5 + 2 * mmsize] paddd m5, m11 pmaddwd m11, m9, [r5 + 1 * mmsize] paddd m7, m11 pmaddwd m9, [r5] movu xm11, [r7 + r4] ; m11 = row 11 punpckhwd xm12, xm10, xm11 punpcklwd xm10, xm11 vinserti128 m10, m10, xm12, 1 pmaddwd m12, m10, [r5 + 3 * mmsize] paddd m4, m12 pmaddwd m12, m10, [r5 + 2 * mmsize] paddd m6, m12 pmaddwd m12, m10, [r5 + 1 * mmsize] paddd m8, m12 pmaddwd m10, [r5] lea r7, [r7 + r1 * 4] movu xm12, [r7] ; m12 = row 12 punpckhwd xm13, xm11, xm12 punpcklwd xm11, xm12 vinserti128 m11, m11, xm13, 1 pmaddwd m13, m11, [r5 + 3 * mmsize] paddd m5, m13 pmaddwd m13, m11, [r5 + 2 * mmsize] paddd m7, m13 pmaddwd m13, m11, [r5 + 1 * mmsize] paddd m9, m13 pmaddwd m11, [r5] %ifidn %1,sp paddd m0, m14 paddd m1, m14 paddd m2, m14 paddd m3, m14 paddd m4, m14 paddd m5, m14 psrad m0, 12 psrad m1, 12 psrad m2, 12 psrad m3, 12 psrad m4, 12 psrad m5, 12 %else psrad m0, 6 psrad m1, 6 psrad m2, 6 psrad m3, 6 psrad m4, 6 psrad m5, 6 %endif packssdw m0, m1 packssdw m2, m3 packssdw m4, m5 %ifidn %1,sp packuswb m0, m2 mova m5, [interp8_hps_shuf] vpermd m0, m5, m0 vextracti128 xm2, m0, 1 movq [r2], xm0 movhps [r2 + r3], xm0 movq [r2 + r3 * 2], xm2 movhps [r2 + r6], xm2 %else vpermq m0, m0, 11011000b vpermq m2, m2, 11011000b vextracti128 xm1, m0, 1 vextracti128 xm3, m2, 1 movu [r2], xm0 movu [r2 + r3], xm1 movu [r2 + r3 * 2], xm2 movu [r2 + r6], xm3 %endif movu xm13, [r7 + r1] ; m13 = row 13 punpckhwd xm0, xm12, xm13 punpcklwd xm12, xm13 vinserti128 m12, m12, xm0, 1 pmaddwd m0, m12, [r5 + 3 * mmsize] paddd m6, m0 pmaddwd m0, m12, [r5 + 2 * mmsize] paddd m8, m0 pmaddwd m0, m12, [r5 + 1 * mmsize] paddd m10, m0 pmaddwd m12, [r5] movu xm0, [r7 + r1 * 2] ; m0 = row 14 punpckhwd xm1, xm13, xm0 punpcklwd xm13, xm0 vinserti128 m13, m13, xm1, 1 pmaddwd m1, m13, [r5 + 3 * mmsize] paddd m7, m1 pmaddwd m1, m13, [r5 + 2 * mmsize] paddd m9, m1 pmaddwd m1, m13, [r5 + 1 * mmsize] paddd m11, m1 pmaddwd m13, [r5] %ifidn %1,sp paddd m6, m14 paddd m7, m14 psrad m6, 12 psrad m7, 12 %else psrad m6, 6 psrad m7, 6 %endif packssdw m6, m7 lea r8, [r2 + r3 * 4] %ifidn %1,sp packuswb m4, m6 vpermd m4, m5, m4 vextracti128 xm6, m4, 1 movq [r8], xm4 movhps [r8 + r3], xm4 movq [r8 + r3 * 2], xm6 movhps [r8 + r6], xm6 %else vpermq m4, m4, 11011000b vpermq m6, m6, 11011000b vextracti128 xm1, m4, 1 vextracti128 xm7, m6, 1 movu [r8], xm4 movu [r8 + r3], xm1 movu [r8 + r3 * 2], xm6 movu [r8 + r6], xm7 %endif movu xm1, [r7 + r4] ; m1 = row 15 punpckhwd xm2, xm0, xm1 punpcklwd xm0, xm1 vinserti128 m0, m0, xm2, 1 pmaddwd m2, m0, [r5 + 3 * mmsize] paddd m8, m2 pmaddwd m2, m0, [r5 + 2 * mmsize] paddd m10, m2 pmaddwd m2, m0, [r5 + 1 * mmsize] paddd m12, m2 pmaddwd m0, [r5] lea r7, [r7 + r1 * 4] movu xm2, [r7] ; m2 = row 16 punpckhwd xm3, xm1, xm2 punpcklwd xm1, xm2 vinserti128 m1, m1, xm3, 1 pmaddwd m3, m1, [r5 + 3 * mmsize] paddd m9, m3 pmaddwd m3, m1, [r5 + 2 * mmsize] paddd m11, m3 pmaddwd m3, m1, [r5 + 1 * mmsize] paddd m13, m3 pmaddwd m1, [r5] movu xm3, [r7 + r1] ; m3 = row 17 punpckhwd xm4, xm2, xm3 punpcklwd xm2, xm3 vinserti128 m2, m2, xm4, 1 pmaddwd m4, m2, [r5 + 3 * mmsize] paddd m10, m4 pmaddwd m4, m2, [r5 + 2 * mmsize] paddd m12, m4 pmaddwd m2, [r5 + 1 * mmsize] paddd m0, m2 movu xm4, [r7 + r1 * 2] ; m4 = row 18 punpckhwd xm2, xm3, xm4 punpcklwd xm3, xm4 
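    ; (same interleave idiom as every row pair above: punpckhwd/punpcklwd pair
    ; two adjacent source rows word-wise, vinserti128 stacks the two halves
    ; into one ymm register, and each pmaddwd against [r5 + k * mmsize] folds
    ; filter-tap pair k of the 8-tap kernel into the running row accumulators)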
vinserti128 m3, m3, xm2, 1 pmaddwd m2, m3, [r5 + 3 * mmsize] paddd m11, m2 pmaddwd m2, m3, [r5 + 2 * mmsize] paddd m13, m2 pmaddwd m3, [r5 + 1 * mmsize] paddd m1, m3 movu xm2, [r7 + r4] ; m2 = row 19 punpckhwd xm6, xm4, xm2 punpcklwd xm4, xm2 vinserti128 m4, m4, xm6, 1 pmaddwd m6, m4, [r5 + 3 * mmsize] paddd m12, m6 pmaddwd m4, [r5 + 2 * mmsize] paddd m0, m4 lea r7, [r7 + r1 * 4] movu xm6, [r7] ; m6 = row 20 punpckhwd xm7, xm2, xm6 punpcklwd xm2, xm6 vinserti128 m2, m2, xm7, 1 pmaddwd m7, m2, [r5 + 3 * mmsize] paddd m13, m7 pmaddwd m2, [r5 + 2 * mmsize] paddd m1, m2 movu xm7, [r7 + r1] ; m7 = row 21 punpckhwd xm2, xm6, xm7 punpcklwd xm6, xm7 vinserti128 m6, m6, xm2, 1 pmaddwd m6, [r5 + 3 * mmsize] paddd m0, m6 movu xm2, [r7 + r1 * 2] ; m2 = row 22 punpckhwd xm3, xm7, xm2 punpcklwd xm7, xm2 vinserti128 m7, m7, xm3, 1 pmaddwd m7, [r5 + 3 * mmsize] paddd m1, m7 %ifidn %1,sp paddd m8, m14 paddd m9, m14 paddd m10, m14 paddd m11, m14 paddd m12, m14 paddd m13, m14 paddd m0, m14 paddd m1, m14 psrad m8, 12 psrad m9, 12 psrad m10, 12 psrad m11, 12 psrad m12, 12 psrad m13, 12 psrad m0, 12 psrad m1, 12 %else psrad m8, 6 psrad m9, 6 psrad m10, 6 psrad m11, 6 psrad m12, 6 psrad m13, 6 psrad m0, 6 psrad m1, 6 %endif packssdw m8, m9 packssdw m10, m11 packssdw m12, m13 packssdw m0, m1 lea r8, [r8 + r3 * 4] %ifidn %1,sp packuswb m8, m10 packuswb m12, m0 vpermd m8, m5, m8 vpermd m12, m5, m12 vextracti128 xm10, m8, 1 vextracti128 xm0, m12, 1 movq [r8], xm8 movhps [r8 + r3], xm8 movq [r8 + r3 * 2], xm10 movhps [r8 + r6], xm10 lea r8, [r8 + r3 * 4] movq [r8], xm12 movhps [r8 + r3], xm12 movq [r8 + r3 * 2], xm0 movhps [r8 + r6], xm0 %else vpermq m8, m8, 11011000b vpermq m10, m10, 11011000b vpermq m12, m12, 11011000b vpermq m0, m0, 11011000b vextracti128 xm9, m8, 1 vextracti128 xm11, m10, 1 vextracti128 xm13, m12, 1 vextracti128 xm1, m0, 1 movu [r8], xm8 movu [r8 + r3], xm9 movu [r8 + r3 * 2], xm10 movu [r8 + r6], xm11 lea r8, [r8 + r3 * 4] movu [r8], xm12 movu [r8 + r3], xm13 movu [r8 + r3 * 2], xm0 movu [r8 + r6], xm1 %endif %endmacro %macro FILTER_VER_LUMA_AVX2_Nx16 2 INIT_YMM avx2 %if ARCH_X86_64 == 1 cglobal interp_8tap_vert_%1_%2x16, 4, 10, 15 mov r4d, r4m shl r4d, 7 add r1d, r1d %ifdef PIC lea r5, [pw_LumaCoeffVer] add r5, r4 %else lea r5, [pw_LumaCoeffVer + r4] %endif lea r4, [r1 * 3] sub r0, r4 %ifidn %1,sp mova m14, [pd_526336] %else add r3d, r3d %endif lea r6, [r3 * 3] mov r9d, %2 / 8 .loopW: PROCESS_LUMA_AVX2_W8_16R %1 %ifidn %1,sp add r2, 8 %else add r2, 16 %endif add r0, 16 dec r9d jnz .loopW RET %endif %endmacro FILTER_VER_LUMA_AVX2_Nx16 sp, 16 FILTER_VER_LUMA_AVX2_Nx16 sp, 32 FILTER_VER_LUMA_AVX2_Nx16 sp, 64 FILTER_VER_LUMA_AVX2_Nx16 ss, 16 FILTER_VER_LUMA_AVX2_Nx16 ss, 32 FILTER_VER_LUMA_AVX2_Nx16 ss, 64 %macro FILTER_VER_LUMA_AVX2_NxN 3 INIT_YMM avx2 %if ARCH_X86_64 == 1 cglobal interp_8tap_vert_%3_%1x%2, 4, 12, 15 mov r4d, r4m shl r4d, 7 add r1d, r1d %ifdef PIC lea r5, [pw_LumaCoeffVer] add r5, r4 %else lea r5, [pw_LumaCoeffVer + r4] %endif lea r4, [r1 * 3] sub r0, r4 %ifidn %3,sp mova m14, [pd_526336] %else add r3d, r3d %endif lea r6, [r3 * 3] lea r11, [r1 * 4] mov r9d, %2 / 16 .loopH: mov r10d, %1 / 8 .loopW: PROCESS_LUMA_AVX2_W8_16R %3 %ifidn %3,sp add r2, 8 %else add r2, 16 %endif add r0, 16 dec r10d jnz .loopW sub r7, r11 lea r0, [r7 - 2 * %1 + 16] %ifidn %3,sp lea r2, [r8 + r3 * 4 - %1 + 8] %else lea r2, [r8 + r3 * 4 - 2 * %1 + 16] %endif dec r9d jnz .loopH RET %endif %endmacro FILTER_VER_LUMA_AVX2_NxN 16, 32, sp FILTER_VER_LUMA_AVX2_NxN 16, 64, sp FILTER_VER_LUMA_AVX2_NxN 24, 32, sp 
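; NxN vertical 8-tap filters, instantiated from one macro for both data paths:
; the sp variants add pd_526336 (= (8192 << 6) + (1 << 11) -- presumably the
; intermediate-precision bias plus the rounding term for the >>12), shift the
; 32-bit sums right by 12 and clamp to 8-bit pixels via packssdw/packuswb; the
; ss variants just shift right by 6 and store 16-bit results.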
FILTER_VER_LUMA_AVX2_NxN 32, 32, sp FILTER_VER_LUMA_AVX2_NxN 32, 64, sp FILTER_VER_LUMA_AVX2_NxN 48, 64, sp FILTER_VER_LUMA_AVX2_NxN 64, 32, sp FILTER_VER_LUMA_AVX2_NxN 64, 48, sp FILTER_VER_LUMA_AVX2_NxN 64, 64, sp FILTER_VER_LUMA_AVX2_NxN 16, 32, ss FILTER_VER_LUMA_AVX2_NxN 16, 64, ss FILTER_VER_LUMA_AVX2_NxN 24, 32, ss FILTER_VER_LUMA_AVX2_NxN 32, 32, ss FILTER_VER_LUMA_AVX2_NxN 32, 64, ss FILTER_VER_LUMA_AVX2_NxN 48, 64, ss FILTER_VER_LUMA_AVX2_NxN 64, 32, ss FILTER_VER_LUMA_AVX2_NxN 64, 48, ss FILTER_VER_LUMA_AVX2_NxN 64, 64, ss %macro FILTER_VER_LUMA_S_AVX2_12x16 1 INIT_YMM avx2 %if ARCH_X86_64 == 1 cglobal interp_8tap_vert_%1_12x16, 4, 9, 15 mov r4d, r4m shl r4d, 7 add r1d, r1d %ifdef PIC lea r5, [pw_LumaCoeffVer] add r5, r4 %else lea r5, [pw_LumaCoeffVer + r4] %endif lea r4, [r1 * 3] sub r0, r4 %ifidn %1,sp mova m14, [pd_526336] %else add r3d, r3d %endif lea r6, [r3 * 3] PROCESS_LUMA_AVX2_W8_16R %1 %ifidn %1,sp add r2, 8 %else add r2, 16 %endif add r0, 16 mova m7, m14 PROCESS_LUMA_AVX2_W4_16R %1 RET %endif %endmacro FILTER_VER_LUMA_S_AVX2_12x16 sp FILTER_VER_LUMA_S_AVX2_12x16 ss %macro FILTER_VER_LUMA_S_AVX2_16x12 1 INIT_YMM avx2 %if ARCH_X86_64 == 1 cglobal interp_8tap_vert_%1_16x12, 4, 10, 15 mov r4d, r4m shl r4d, 7 add r1d, r1d %ifdef PIC lea r5, [pw_LumaCoeffVer] add r5, r4 %else lea r5, [pw_LumaCoeffVer + r4] %endif lea r4, [r1 * 3] sub r0, r4 %ifidn %1,sp mova m14, [pd_526336] %else add r3d, r3d %endif lea r6, [r3 * 3] mov r9d, 2 .loopW: movu xm0, [r0] ; m0 = row 0 movu xm1, [r0 + r1] ; m1 = row 1 punpckhwd xm2, xm0, xm1 punpcklwd xm0, xm1 vinserti128 m0, m0, xm2, 1 pmaddwd m0, [r5] movu xm2, [r0 + r1 * 2] ; m2 = row 2 punpckhwd xm3, xm1, xm2 punpcklwd xm1, xm2 vinserti128 m1, m1, xm3, 1 pmaddwd m1, [r5] movu xm3, [r0 + r4] ; m3 = row 3 punpckhwd xm4, xm2, xm3 punpcklwd xm2, xm3 vinserti128 m2, m2, xm4, 1 pmaddwd m4, m2, [r5 + 1 * mmsize] paddd m0, m4 pmaddwd m2, [r5] lea r7, [r0 + r1 * 4] movu xm4, [r7] ; m4 = row 4 punpckhwd xm5, xm3, xm4 punpcklwd xm3, xm4 vinserti128 m3, m3, xm5, 1 pmaddwd m5, m3, [r5 + 1 * mmsize] paddd m1, m5 pmaddwd m3, [r5] movu xm5, [r7 + r1] ; m5 = row 5 punpckhwd xm6, xm4, xm5 punpcklwd xm4, xm5 vinserti128 m4, m4, xm6, 1 pmaddwd m6, m4, [r5 + 2 * mmsize] paddd m0, m6 pmaddwd m6, m4, [r5 + 1 * mmsize] paddd m2, m6 pmaddwd m4, [r5] movu xm6, [r7 + r1 * 2] ; m6 = row 6 punpckhwd xm7, xm5, xm6 punpcklwd xm5, xm6 vinserti128 m5, m5, xm7, 1 pmaddwd m7, m5, [r5 + 2 * mmsize] paddd m1, m7 pmaddwd m7, m5, [r5 + 1 * mmsize] paddd m3, m7 pmaddwd m5, [r5] movu xm7, [r7 + r4] ; m7 = row 7 punpckhwd xm8, xm6, xm7 punpcklwd xm6, xm7 vinserti128 m6, m6, xm8, 1 pmaddwd m8, m6, [r5 + 3 * mmsize] paddd m0, m8 pmaddwd m8, m6, [r5 + 2 * mmsize] paddd m2, m8 pmaddwd m8, m6, [r5 + 1 * mmsize] paddd m4, m8 pmaddwd m6, [r5] lea r7, [r7 + r1 * 4] movu xm8, [r7] ; m8 = row 8 punpckhwd xm9, xm7, xm8 punpcklwd xm7, xm8 vinserti128 m7, m7, xm9, 1 pmaddwd m9, m7, [r5 + 3 * mmsize] paddd m1, m9 pmaddwd m9, m7, [r5 + 2 * mmsize] paddd m3, m9 pmaddwd m9, m7, [r5 + 1 * mmsize] paddd m5, m9 pmaddwd m7, [r5] movu xm9, [r7 + r1] ; m9 = row 9 punpckhwd xm10, xm8, xm9 punpcklwd xm8, xm9 vinserti128 m8, m8, xm10, 1 pmaddwd m10, m8, [r5 + 3 * mmsize] paddd m2, m10 pmaddwd m10, m8, [r5 + 2 * mmsize] paddd m4, m10 pmaddwd m10, m8, [r5 + 1 * mmsize] paddd m6, m10 pmaddwd m8, [r5] movu xm10, [r7 + r1 * 2] ; m10 = row 10 punpckhwd xm11, xm9, xm10 punpcklwd xm9, xm10 vinserti128 m9, m9, xm11, 1 pmaddwd m11, m9, [r5 + 3 * mmsize] paddd m3, m11 pmaddwd m11, m9, [r5 + 2 * mmsize] paddd m5, m11 
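    ; each interleaved source-row pair is multiplied by up to four tap pairs:
    ; [r5 + 3 * mmsize] closes the oldest in-flight output row, the middle
    ; taps update rows still being accumulated, and [r5] opens a fresh
    ; accumulator for the row that has just entered the 8-tap window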
pmaddwd m11, m9, [r5 + 1 * mmsize] paddd m7, m11 pmaddwd m9, [r5] movu xm11, [r7 + r4] ; m11 = row 11 punpckhwd xm12, xm10, xm11 punpcklwd xm10, xm11 vinserti128 m10, m10, xm12, 1 pmaddwd m12, m10, [r5 + 3 * mmsize] paddd m4, m12 pmaddwd m12, m10, [r5 + 2 * mmsize] paddd m6, m12 pmaddwd m12, m10, [r5 + 1 * mmsize] paddd m8, m12 pmaddwd m10, [r5] lea r7, [r7 + r1 * 4] movu xm12, [r7] ; m12 = row 12 punpckhwd xm13, xm11, xm12 punpcklwd xm11, xm12 vinserti128 m11, m11, xm13, 1 pmaddwd m13, m11, [r5 + 3 * mmsize] paddd m5, m13 pmaddwd m13, m11, [r5 + 2 * mmsize] paddd m7, m13 pmaddwd m13, m11, [r5 + 1 * mmsize] paddd m9, m13 pmaddwd m11, [r5] %ifidn %1,sp paddd m0, m14 paddd m1, m14 paddd m2, m14 paddd m3, m14 paddd m4, m14 paddd m5, m14 psrad m0, 12 psrad m1, 12 psrad m2, 12 psrad m3, 12 psrad m4, 12 psrad m5, 12 %else psrad m0, 6 psrad m1, 6 psrad m2, 6 psrad m3, 6 psrad m4, 6 psrad m5, 6 %endif packssdw m0, m1 packssdw m2, m3 packssdw m4, m5 %ifidn %1,sp packuswb m0, m2 mova m5, [interp8_hps_shuf] vpermd m0, m5, m0 vextracti128 xm2, m0, 1 movq [r2], xm0 movhps [r2 + r3], xm0 movq [r2 + r3 * 2], xm2 movhps [r2 + r6], xm2 %else vpermq m0, m0, 11011000b vpermq m2, m2, 11011000b vextracti128 xm1, m0, 1 vextracti128 xm3, m2, 1 movu [r2], xm0 movu [r2 + r3], xm1 movu [r2 + r3 * 2], xm2 movu [r2 + r6], xm3 %endif movu xm13, [r7 + r1] ; m13 = row 13 punpckhwd xm0, xm12, xm13 punpcklwd xm12, xm13 vinserti128 m12, m12, xm0, 1 pmaddwd m0, m12, [r5 + 3 * mmsize] paddd m6, m0 pmaddwd m0, m12, [r5 + 2 * mmsize] paddd m8, m0 pmaddwd m12, [r5 + 1 * mmsize] paddd m10, m12 movu xm0, [r7 + r1 * 2] ; m0 = row 14 punpckhwd xm1, xm13, xm0 punpcklwd xm13, xm0 vinserti128 m13, m13, xm1, 1 pmaddwd m1, m13, [r5 + 3 * mmsize] paddd m7, m1 pmaddwd m1, m13, [r5 + 2 * mmsize] paddd m9, m1 pmaddwd m13, [r5 + 1 * mmsize] paddd m11, m13 %ifidn %1,sp paddd m6, m14 paddd m7, m14 psrad m6, 12 psrad m7, 12 %else psrad m6, 6 psrad m7, 6 %endif packssdw m6, m7 lea r8, [r2 + r3 * 4] %ifidn %1,sp packuswb m4, m6 vpermd m4, m5, m4 vextracti128 xm6, m4, 1 movq [r8], xm4 movhps [r8 + r3], xm4 movq [r8 + r3 * 2], xm6 movhps [r8 + r6], xm6 %else vpermq m4, m4, 11011000b vpermq m6, m6, 11011000b vextracti128 xm1, m4, 1 vextracti128 xm7, m6, 1 movu [r8], xm4 movu [r8 + r3], xm1 movu [r8 + r3 * 2], xm6 movu [r8 + r6], xm7 %endif movu xm1, [r7 + r4] ; m1 = row 15 punpckhwd xm2, xm0, xm1 punpcklwd xm0, xm1 vinserti128 m0, m0, xm2, 1 pmaddwd m2, m0, [r5 + 3 * mmsize] paddd m8, m2 pmaddwd m0, [r5 + 2 * mmsize] paddd m10, m0 lea r7, [r7 + r1 * 4] movu xm2, [r7] ; m2 = row 16 punpckhwd xm3, xm1, xm2 punpcklwd xm1, xm2 vinserti128 m1, m1, xm3, 1 pmaddwd m3, m1, [r5 + 3 * mmsize] paddd m9, m3 pmaddwd m1, [r5 + 2 * mmsize] paddd m11, m1 movu xm3, [r7 + r1] ; m3 = row 17 punpckhwd xm4, xm2, xm3 punpcklwd xm2, xm3 vinserti128 m2, m2, xm4, 1 pmaddwd m2, [r5 + 3 * mmsize] paddd m10, m2 movu xm4, [r7 + r1 * 2] ; m4 = row 18 punpckhwd xm2, xm3, xm4 punpcklwd xm3, xm4 vinserti128 m3, m3, xm2, 1 pmaddwd m3, [r5 + 3 * mmsize] paddd m11, m3 %ifidn %1,sp paddd m8, m14 paddd m9, m14 paddd m10, m14 paddd m11, m14 psrad m8, 12 psrad m9, 12 psrad m10, 12 psrad m11, 12 %else psrad m8, 6 psrad m9, 6 psrad m10, 6 psrad m11, 6 %endif packssdw m8, m9 packssdw m10, m11 lea r8, [r8 + r3 * 4] %ifidn %1,sp packuswb m8, m10 vpermd m8, m5, m8 vextracti128 xm10, m8, 1 movq [r8], xm8 movhps [r8 + r3], xm8 movq [r8 + r3 * 2], xm10 movhps [r8 + r6], xm10 add r2, 8 %else vpermq m8, m8, 11011000b vpermq m10, m10, 11011000b vextracti128 xm9, m8, 1 vextracti128 xm11, m10, 1 movu 
[r8], xm8 movu [r8 + r3], xm9 movu [r8 + r3 * 2], xm10 movu [r8 + r6], xm11 add r2, 16 %endif add r0, 16 dec r9d jnz .loopW RET %endif %endmacro FILTER_VER_LUMA_S_AVX2_16x12 sp FILTER_VER_LUMA_S_AVX2_16x12 ss %macro FILTER_VER_LUMA_S_AVX2_16x4 1 INIT_YMM avx2 cglobal interp_8tap_vert_%1_16x4, 4, 7, 8, 0 - gprsize mov r4d, r4m shl r4d, 7 add r1d, r1d %ifdef PIC lea r5, [pw_LumaCoeffVer] add r5, r4 %else lea r5, [pw_LumaCoeffVer + r4] %endif lea r4, [r1 * 3] sub r0, r4 %ifidn %1,sp mova m7, [pd_526336] %else add r3d, r3d %endif mov dword [rsp], 2 .loopW: PROCESS_LUMA_S_AVX2_W8_4R %1 lea r6, [r3 * 3] %ifidn %1,sp movq [r2], xm0 movhps [r2 + r3], xm0 movq [r2 + r3 * 2], xm2 movhps [r2 + r6], xm2 add r2, 8 %else movu [r2], xm0 movu [r2 + r3], xm1 movu [r2 + r3 * 2], xm2 movu [r2 + r6], xm3 add r2, 16 %endif lea r6, [8 * r1 - 16] sub r0, r6 dec dword [rsp] jnz .loopW RET %endmacro FILTER_VER_LUMA_S_AVX2_16x4 sp FILTER_VER_LUMA_S_AVX2_16x4 ss %macro PROCESS_LUMA_S_AVX2_W8_8R 1 movu xm0, [r0] ; m0 = row 0 movu xm1, [r0 + r1] ; m1 = row 1 punpckhwd xm2, xm0, xm1 punpcklwd xm0, xm1 vinserti128 m0, m0, xm2, 1 pmaddwd m0, [r5] movu xm2, [r0 + r1 * 2] ; m2 = row 2 punpckhwd xm3, xm1, xm2 punpcklwd xm1, xm2 vinserti128 m1, m1, xm3, 1 pmaddwd m1, [r5] movu xm3, [r0 + r4] ; m3 = row 3 punpckhwd xm4, xm2, xm3 punpcklwd xm2, xm3 vinserti128 m2, m2, xm4, 1 pmaddwd m4, m2, [r5 + 1 * mmsize] paddd m0, m4 pmaddwd m2, [r5] lea r7, [r0 + r1 * 4] movu xm4, [r7] ; m4 = row 4 punpckhwd xm5, xm3, xm4 punpcklwd xm3, xm4 vinserti128 m3, m3, xm5, 1 pmaddwd m5, m3, [r5 + 1 * mmsize] paddd m1, m5 pmaddwd m3, [r5] movu xm5, [r7 + r1] ; m5 = row 5 punpckhwd xm6, xm4, xm5 punpcklwd xm4, xm5 vinserti128 m4, m4, xm6, 1 pmaddwd m6, m4, [r5 + 2 * mmsize] paddd m0, m6 pmaddwd m6, m4, [r5 + 1 * mmsize] paddd m2, m6 pmaddwd m4, [r5] movu xm6, [r7 + r1 * 2] ; m6 = row 6 punpckhwd xm7, xm5, xm6 punpcklwd xm5, xm6 vinserti128 m5, m5, xm7, 1 pmaddwd m7, m5, [r5 + 2 * mmsize] paddd m1, m7 pmaddwd m7, m5, [r5 + 1 * mmsize] paddd m3, m7 pmaddwd m5, [r5] movu xm7, [r7 + r4] ; m7 = row 7 punpckhwd xm8, xm6, xm7 punpcklwd xm6, xm7 vinserti128 m6, m6, xm8, 1 pmaddwd m8, m6, [r5 + 3 * mmsize] paddd m0, m8 pmaddwd m8, m6, [r5 + 2 * mmsize] paddd m2, m8 pmaddwd m8, m6, [r5 + 1 * mmsize] paddd m4, m8 pmaddwd m6, [r5] lea r7, [r7 + r1 * 4] movu xm8, [r7] ; m8 = row 8 punpckhwd xm9, xm7, xm8 punpcklwd xm7, xm8 vinserti128 m7, m7, xm9, 1 pmaddwd m9, m7, [r5 + 3 * mmsize] paddd m1, m9 pmaddwd m9, m7, [r5 + 2 * mmsize] paddd m3, m9 pmaddwd m9, m7, [r5 + 1 * mmsize] paddd m5, m9 pmaddwd m7, [r5] movu xm9, [r7 + r1] ; m9 = row 9 punpckhwd xm10, xm8, xm9 punpcklwd xm8, xm9 vinserti128 m8, m8, xm10, 1 pmaddwd m10, m8, [r5 + 3 * mmsize] paddd m2, m10 pmaddwd m10, m8, [r5 + 2 * mmsize] paddd m4, m10 pmaddwd m8, [r5 + 1 * mmsize] paddd m6, m8 movu xm10, [r7 + r1 * 2] ; m10 = row 10 punpckhwd xm8, xm9, xm10 punpcklwd xm9, xm10 vinserti128 m9, m9, xm8, 1 pmaddwd m8, m9, [r5 + 3 * mmsize] paddd m3, m8 pmaddwd m8, m9, [r5 + 2 * mmsize] paddd m5, m8 pmaddwd m9, [r5 + 1 * mmsize] paddd m7, m9 movu xm8, [r7 + r4] ; m8 = row 11 punpckhwd xm9, xm10, xm8 punpcklwd xm10, xm8 vinserti128 m10, m10, xm9, 1 pmaddwd m9, m10, [r5 + 3 * mmsize] paddd m4, m9 pmaddwd m10, [r5 + 2 * mmsize] paddd m6, m10 lea r7, [r7 + r1 * 4] movu xm9, [r7] ; m9 = row 12 punpckhwd xm10, xm8, xm9 punpcklwd xm8, xm9 vinserti128 m8, m8, xm10, 1 pmaddwd m10, m8, [r5 + 3 * mmsize] paddd m5, m10 pmaddwd m8, [r5 + 2 * mmsize] paddd m7, m8 %ifidn %1,sp paddd m0, m11 paddd m1, m11 paddd m2, m11 paddd m3, 
m11 paddd m4, m11 paddd m5, m11 psrad m0, 12 psrad m1, 12 psrad m2, 12 psrad m3, 12 psrad m4, 12 psrad m5, 12 %else psrad m0, 6 psrad m1, 6 psrad m2, 6 psrad m3, 6 psrad m4, 6 psrad m5, 6 %endif packssdw m0, m1 packssdw m2, m3 packssdw m4, m5 %ifidn %1,sp packuswb m0, m2 mova m5, [interp8_hps_shuf] vpermd m0, m5, m0 vextracti128 xm2, m0, 1 movq [r2], xm0 movhps [r2 + r3], xm0 movq [r2 + r3 * 2], xm2 movhps [r2 + r6], xm2 %else vpermq m0, m0, 11011000b vpermq m2, m2, 11011000b vextracti128 xm1, m0, 1 vextracti128 xm3, m2, 1 movu [r2], xm0 movu [r2 + r3], xm1 movu [r2 + r3 * 2], xm2 movu [r2 + r6], xm3 %endif movu xm10, [r7 + r1] ; m10 = row 13 punpckhwd xm0, xm9, xm10 punpcklwd xm9, xm10 vinserti128 m9, m9, xm0, 1 pmaddwd m9, [r5 + 3 * mmsize] paddd m6, m9 movu xm0, [r7 + r1 * 2] ; m0 = row 14 punpckhwd xm1, xm10, xm0 punpcklwd xm10, xm0 vinserti128 m10, m10, xm1, 1 pmaddwd m10, [r5 + 3 * mmsize] paddd m7, m10 %ifidn %1,sp paddd m6, m11 paddd m7, m11 psrad m6, 12 psrad m7, 12 %else psrad m6, 6 psrad m7, 6 %endif packssdw m6, m7 lea r8, [r2 + r3 * 4] %ifidn %1,sp packuswb m4, m6 vpermd m4, m5, m4 vextracti128 xm6, m4, 1 movq [r8], xm4 movhps [r8 + r3], xm4 movq [r8 + r3 * 2], xm6 movhps [r8 + r6], xm6 %else vpermq m4, m4, 11011000b vpermq m6, m6, 11011000b vextracti128 xm5, m4, 1 vextracti128 xm7, m6, 1 movu [r8], xm4 movu [r8 + r3], xm5 movu [r8 + r3 * 2], xm6 movu [r8 + r6], xm7 %endif %endmacro %macro FILTER_VER_LUMA_AVX2_Nx8 2 INIT_YMM avx2 %if ARCH_X86_64 == 1 cglobal interp_8tap_vert_%1_%2x8, 4, 10, 12 mov r4d, r4m shl r4d, 7 add r1d, r1d %ifdef PIC lea r5, [pw_LumaCoeffVer] add r5, r4 %else lea r5, [pw_LumaCoeffVer + r4] %endif lea r4, [r1 * 3] sub r0, r4 %ifidn %1,sp mova m11, [pd_526336] %else add r3d, r3d %endif lea r6, [r3 * 3] mov r9d, %2 / 8 .loopW: PROCESS_LUMA_S_AVX2_W8_8R %1 %ifidn %1,sp add r2, 8 %else add r2, 16 %endif add r0, 16 dec r9d jnz .loopW RET %endif %endmacro FILTER_VER_LUMA_AVX2_Nx8 sp, 32 FILTER_VER_LUMA_AVX2_Nx8 sp, 16 FILTER_VER_LUMA_AVX2_Nx8 ss, 32 FILTER_VER_LUMA_AVX2_Nx8 ss, 16 %macro FILTER_VER_LUMA_S_AVX2_32x24 1 INIT_YMM avx2 %if ARCH_X86_64 == 1 cglobal interp_8tap_vert_%1_32x24, 4, 10, 15 mov r4d, r4m shl r4d, 7 add r1d, r1d %ifdef PIC lea r5, [pw_LumaCoeffVer] add r5, r4 %else lea r5, [pw_LumaCoeffVer + r4] %endif lea r4, [r1 * 3] sub r0, r4 %ifidn %1,sp mova m14, [pd_526336] %else add r3d, r3d %endif lea r6, [r3 * 3] mov r9d, 4 .loopW: PROCESS_LUMA_AVX2_W8_16R %1 %ifidn %1,sp add r2, 8 %else add r2, 16 %endif add r0, 16 dec r9d jnz .loopW lea r9, [r1 * 4] sub r7, r9 lea r0, [r7 - 48] %ifidn %1,sp lea r2, [r8 + r3 * 4 - 24] %else lea r2, [r8 + r3 * 4 - 48] %endif mova m11, m14 mov r9d, 4 .loop: PROCESS_LUMA_S_AVX2_W8_8R %1 %ifidn %1,sp add r2, 8 %else add r2, 16 %endif add r0, 16 dec r9d jnz .loop RET %endif %endmacro FILTER_VER_LUMA_S_AVX2_32x24 sp FILTER_VER_LUMA_S_AVX2_32x24 ss ;----------------------------------------------------------------------------------------------------------------------------- ; void interp_4tap_horiz_ps_32x32(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) ;-----------------------------------------------------------------------------------------------------------------------------; INIT_YMM avx2 cglobal interp_4tap_horiz_ps_32x32, 4,6,8 mov r4d, r4m add r3d, r3d dec r0 ; check isRowExt cmp r5m, byte 0 lea r5, [tab_ChromaCoeff] vpbroadcastw m0, [r5 + r4 * 4 + 0] vpbroadcastw m1, [r5 + r4 * 4 + 2] mova m7, [pw_2000] ; register map ; m0 - interpolate coeff Low ; m1 - interpolate 
coeff High ; m7 - constant pw_2000 mov r4d, 32 je .loop sub r0, r1 add r4d, 3 .loop ; Row 0 movu m2, [r0] movu m3, [r0 + 1] punpckhbw m4, m2, m3 punpcklbw m2, m3 pmaddubsw m4, m0 pmaddubsw m2, m0 movu m3, [r0 + 2] movu m5, [r0 + 3] punpckhbw m6, m3, m5 punpcklbw m3, m5 pmaddubsw m6, m1 pmaddubsw m3, m1 paddw m4, m6 paddw m2, m3 psubw m4, m7 psubw m2, m7 vperm2i128 m3, m2, m4, 0x20 vperm2i128 m5, m2, m4, 0x31 movu [r2], m3 movu [r2 + mmsize], m5 add r2, r3 add r0, r1 dec r4d jnz .loop RET ;----------------------------------------------------------------------------------------------------------------------------- ; void interp_4tap_horiz_ps_16x16(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) ;-----------------------------------------------------------------------------------------------------------------------------; INIT_YMM avx2 cglobal interp_4tap_horiz_ps_16x16, 4,7,6 mov r4d, r4m mov r5d, r5m add r3d, r3d %ifdef PIC lea r6, [tab_ChromaCoeff] vpbroadcastd m0, [r6 + r4 * 4] %else vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] %endif vbroadcasti128 m2, [pw_1] vbroadcasti128 m5, [pw_2000] mova m1, [tab_Tm] ; register map ; m0 - interpolate coeff ; m1 - shuffle order table ; m2 - constant word 1 mov r6d, 16 dec r0 test r5d, r5d je .loop sub r0 , r1 add r6d , 3 .loop ; Row 0 vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m3, m1 pmaddubsw m3, m0 pmaddwd m3, m2 vbroadcasti128 m4, [r0 + 8] pshufb m4, m1 pmaddubsw m4, m0 pmaddwd m4, m2 packssdw m3, m4 psubw m3, m5 vpermq m3, m3, 11011000b movu [r2], m3 add r2, r3 add r0, r1 dec r6d jnz .loop RET ;----------------------------------------------------------------------------------------------------------------------------- ; void interp_4tap_horiz_ps_16xN(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) ;----------------------------------------------------------------------------------------------------------------------------- %macro IPFILTER_CHROMA_PS_16xN_AVX2 2 INIT_YMM avx2 cglobal interp_4tap_horiz_ps_%1x%2, 4,7,6 mov r4d, r4m mov r5d, r5m add r3d, r3d %ifdef PIC lea r6, [tab_ChromaCoeff] vpbroadcastd m0, [r6 + r4 * 4] %else vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] %endif vbroadcasti128 m2, [pw_1] vbroadcasti128 m5, [pw_2000] mova m1, [tab_Tm] ; register map ; m0 - interpolate coeff ; m1 - shuffle order table ; m2 - constant word 1 mov r6d, %2 dec r0 test r5d, r5d je .loop sub r0 , r1 add r6d , 3 .loop ; Row 0 vbroadcasti128 m3, [r0] pshufb m3, m1 pmaddubsw m3, m0 pmaddwd m3, m2 vbroadcasti128 m4, [r0 + 8] pshufb m4, m1 pmaddubsw m4, m0 pmaddwd m4, m2 packssdw m3, m4 psubw m3, m5 vpermq m3, m3, 11011000b movu [r2], m3 add r2, r3 add r0, r1 dec r6d jnz .loop RET %endmacro IPFILTER_CHROMA_PS_16xN_AVX2 16 , 32 IPFILTER_CHROMA_PS_16xN_AVX2 16 , 12 IPFILTER_CHROMA_PS_16xN_AVX2 16 , 8 IPFILTER_CHROMA_PS_16xN_AVX2 16 , 4 IPFILTER_CHROMA_PS_16xN_AVX2 16 , 24 IPFILTER_CHROMA_PS_16xN_AVX2 16 , 64 ;----------------------------------------------------------------------------------------------------------------------------- ; void interp_4tap_horiz_ps_32xN(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) ;----------------------------------------------------------------------------------------------------------------------------- %macro IPFILTER_CHROMA_PS_32xN_AVX2 2 INIT_YMM avx2 cglobal interp_4tap_horiz_ps_%1x%2, 4,7,6 mov r4d, r4m mov r5d, r5m add r3d, r3d %ifdef PIC lea r6, [tab_ChromaCoeff] 
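    ; PIC builds load the table base with lea first, because RIP-relative
    ; addressing cannot also carry the r4-scaled index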
vpbroadcastd m0, [r6 + r4 * 4] %else vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] %endif vbroadcasti128 m2, [pw_1] vbroadcasti128 m5, [pw_2000] mova m1, [tab_Tm] ; register map ; m0 - interpolate coeff ; m1 - shuffle order table ; m2 - constant word 1 mov r6d, %2 dec r0 test r5d, r5d je .loop sub r0 , r1 add r6d , 3 .loop ; Row 0 vbroadcasti128 m3, [r0] pshufb m3, m1 pmaddubsw m3, m0 pmaddwd m3, m2 vbroadcasti128 m4, [r0 + 8] pshufb m4, m1 pmaddubsw m4, m0 pmaddwd m4, m2 packssdw m3, m4 psubw m3, m5 vpermq m3, m3, 11011000b movu [r2], m3 vbroadcasti128 m3, [r0 + 16] pshufb m3, m1 pmaddubsw m3, m0 pmaddwd m3, m2 vbroadcasti128 m4, [r0 + 24] pshufb m4, m1 pmaddubsw m4, m0 pmaddwd m4, m2 packssdw m3, m4 psubw m3, m5 vpermq m3, m3, 11011000b movu [r2 + 32], m3 add r2, r3 add r0, r1 dec r6d jnz .loop RET %endmacro IPFILTER_CHROMA_PS_32xN_AVX2 32 , 16 IPFILTER_CHROMA_PS_32xN_AVX2 32 , 24 IPFILTER_CHROMA_PS_32xN_AVX2 32 , 8 IPFILTER_CHROMA_PS_32xN_AVX2 32 , 64 IPFILTER_CHROMA_PS_32xN_AVX2 32 , 48 ;----------------------------------------------------------------------------------------------------------------------------- ; void interp_4tap_horiz_ps_4x4(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) ;----------------------------------------------------------------------------------------------------------------------------- INIT_YMM avx2 cglobal interp_4tap_horiz_ps_4x4, 4,7,5 mov r4d, r4m mov r5d, r5m add r3d, r3d %ifdef PIC lea r6, [tab_ChromaCoeff] vpbroadcastd m0, [r6 + r4 * 4] %else vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] %endif vbroadcasti128 m2, [pw_1] vbroadcasti128 m1, [tab_Tm] ; register map ; m0 - interpolate coeff ; m1 - shuffle order table ; m2 - constant word 1 dec r0 test r5d, r5d je .label sub r0 , r1 .label ; Row 0-1 movu xm3, [r0] vinserti128 m3, m3, [r0 + r1], 1 pshufb m3, m1 pmaddubsw m3, m0 pmaddwd m3, m2 ; Row 2-3 lea r0, [r0 + r1 * 2] movu xm4, [r0] vinserti128 m4, m4, [r0 + r1], 1 pshufb m4, m1 pmaddubsw m4, m0 pmaddwd m4, m2 packssdw m3, m4 psubw m3, [pw_2000] vextracti128 xm4, m3, 1 movq [r2], xm3 movq [r2+r3], xm4 lea r2, [r2 + r3 * 2] movhps [r2], xm3 movhps [r2 + r3], xm4 test r5d, r5d jz .end lea r2, [r2 + r3 * 2] lea r0, [r0 + r1 * 2] ;Row 5-6 movu xm3, [r0] vinserti128 m3, m3, [r0 + r1], 1 pshufb m3, m1 pmaddubsw m3, m0 pmaddwd m3, m2 ; Row 7 lea r0, [r0 + r1 * 2] vbroadcasti128 m4, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m4, m1 pmaddubsw m4, m0 pmaddwd m4, m2 packssdw m3, m4 psubw m3, [pw_2000] vextracti128 xm4, m3, 1 movq [r2], xm3 movq [r2+r3], xm4 lea r2, [r2 + r3 * 2] movhps [r2], xm3 .end RET cglobal interp_4tap_horiz_ps_4x2, 4,7,5 mov r4d, r4m mov r5d, r5m add r3d, r3d %ifdef PIC lea r6, [tab_ChromaCoeff] vpbroadcastd m0, [r6 + r4 * 4] %else vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] %endif vbroadcasti128 m2, [pw_1] vbroadcasti128 m1, [tab_Tm] ; register map ; m0 - interpolate coeff ; m1 - shuffle order table ; m2 - constant word 1 dec r0 test r5d, r5d je .label sub r0 , r1 .label ; Row 0-1 movu xm3, [r0] vinserti128 m3, m3, [r0 + r1], 1 pshufb m3, m1 pmaddubsw m3, m0 pmaddwd m3, m2 packssdw m3, m3 psubw m3, [pw_2000] vextracti128 xm4, m3, 1 movq [r2], xm3 movq [r2+r3], xm4 test r5d, r5d jz .end lea r2, [r2 + r3 * 2] lea r0, [r0 + r1 * 2] ;Row 2-3 movu xm3, [r0] vinserti128 m3, m3, [r0 + r1], 1 pshufb m3, m1 pmaddubsw m3, m0 pmaddwd m3, m2 ; Row 5 lea r0, [r0 + r1 * 2] vbroadcasti128 m4, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m4, m1 pmaddubsw m4, m0 pmaddwd m4, m2 packssdw m3, m4 psubw m3, 
[pw_2000] vextracti128 xm4, m3, 1 movq [r2], xm3 movq [r2+r3], xm4 lea r2, [r2 + r3 * 2] movhps [r2], xm3 .end RET ;----------------------------------------------------------------------------------------------------------------------------- ; void interp_4tap_horiz_ps_4xN(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) ;-----------------------------------------------------------------------------------------------------------------------------; %macro IPFILTER_CHROMA_PS_4xN_AVX2 2 INIT_YMM avx2 cglobal interp_4tap_horiz_ps_%1x%2, 4,7,5 mov r4d, r4m mov r5d, r5m add r3d, r3d %ifdef PIC lea r6, [tab_ChromaCoeff] vpbroadcastd m0, [r6 + r4 * 4] %else vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] %endif vbroadcasti128 m2, [pw_1] vbroadcasti128 m1, [tab_Tm] ; register map ; m0 - interpolate coeff ; m1 - shuffle order table ; m2 - constant word 1 mov r4, %2 dec r0 test r5d, r5d je .loop sub r0 , r1 .loop sub r4d, 4 ; Row 0-1 movu xm3, [r0] vinserti128 m3, m3, [r0 + r1], 1 pshufb m3, m1 pmaddubsw m3, m0 pmaddwd m3, m2 ; Row 2-3 lea r0, [r0 + r1 * 2] movu xm4, [r0] vinserti128 m4, m4, [r0 + r1], 1 pshufb m4, m1 pmaddubsw m4, m0 pmaddwd m4, m2 packssdw m3, m4 psubw m3, [pw_2000] vextracti128 xm4, m3, 1 movq [r2], xm3 movq [r2+r3], xm4 lea r2, [r2 + r3 * 2] movhps [r2], xm3 movhps [r2 + r3], xm4 lea r2, [r2 + r3 * 2] lea r0, [r0 + r1 * 2] test r4d, r4d jnz .loop test r5d, r5d jz .end ;Row 5-6 movu xm3, [r0] vinserti128 m3, m3, [r0 + r1], 1 pshufb m3, m1 pmaddubsw m3, m0 pmaddwd m3, m2 ; Row 7 lea r0, [r0 + r1 * 2] vbroadcasti128 m4, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m4, m1 pmaddubsw m4, m0 pmaddwd m4, m2 packssdw m3, m4 psubw m3, [pw_2000] vextracti128 xm4, m3, 1 movq [r2], xm3 movq [r2+r3], xm4 lea r2, [r2 + r3 * 2] movhps [r2], xm3 .end RET %endmacro IPFILTER_CHROMA_PS_4xN_AVX2 4 , 8 IPFILTER_CHROMA_PS_4xN_AVX2 4 , 16 ;----------------------------------------------------------------------------------------------------------------------------- ; void interp_4tap_horiz_ps_8x8(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) ;-----------------------------------------------------------------------------------------------------------------------------; INIT_YMM avx2 cglobal interp_4tap_horiz_ps_8x8, 4,7,6 mov r4d, r4m mov r5d, r5m add r3d, r3d %ifdef PIC lea r6, [tab_ChromaCoeff] vpbroadcastd m0, [r6 + r4 * 4] %else vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] %endif vbroadcasti128 m2, [pw_1] vbroadcasti128 m5, [pw_2000] mova m1, [tab_Tm] ; register map ; m0 - interpolate coeff ; m1 - shuffle order table ; m2 - constant word 1 mov r6d, 4 dec r0 test r5d, r5d je .loop sub r0 , r1 add r6d , 1 .loop dec r6d ; Row 0 vbroadcasti128 m3, [r0] pshufb m3, m1 pmaddubsw m3, m0 pmaddwd m3, m2 ; Row 1 vbroadcasti128 m4, [r0 + r1] pshufb m4, m1 pmaddubsw m4, m0 pmaddwd m4, m2 packssdw m3, m4 psubw m3, m5 vpermq m3, m3, 11011000b vextracti128 xm4, m3, 1 movu [r2], xm3 movu [r2 + r3], xm4 lea r2, [r2 + r3 * 2] lea r0, [r0 + r1 * 2] test r6d, r6d jnz .loop test r5d, r5d je .end ;Row 11 vbroadcasti128 m3, [r0] pshufb m3, m1 pmaddubsw m3, m0 pmaddwd m3, m2 packssdw m3, m3 psubw m3, m5 vpermq m3, m3, 11011000b movu [r2], xm3 .end RET INIT_YMM avx2 cglobal interp_4tap_horiz_pp_4x2, 4,6,4 mov r4d, r4m %ifdef PIC lea r5, [tab_ChromaCoeff] vpbroadcastd m0, [r5 + r4 * 4] %else vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] %endif vbroadcasti128 m1, [tab_Tm] ; register map ; m0 - interpolate coeff ; m1 - shuffle order table ; Row 
0-1 movu xm2, [r0 - 1] vinserti128 m2, m2, [r0 + r1 - 1], 1 pshufb m2, m1 pmaddubsw m2, m0 pmaddwd m2, [pw_1] packssdw m2, m2 pmulhrsw m2, [pw_512] vextracti128 xm3, m2, 1 packuswb xm2, xm3 movd [r2], xm2 pextrd [r2+r3], xm2, 2 RET ;------------------------------------------------------------------------------------------------------------- ; void interp_4tap_horiz_pp_32xN(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx ;------------------------------------------------------------------------------------------------------------- %macro IPFILTER_CHROMA_PP_32xN_AVX2 2 INIT_YMM avx2 cglobal interp_4tap_horiz_pp_%1x%2, 4,6,7 mov r4d, r4m %ifdef PIC lea r5, [tab_ChromaCoeff] vpbroadcastd m0, [r5 + r4 * 4] %else vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] %endif mova m1, [interp4_horiz_shuf1] vpbroadcastd m2, [pw_1] mova m6, [pw_512] ; register map ; m0 - interpolate coeff ; m1 - shuffle order table ; m2 - constant word 1 dec r0 mov r4d, %2 .loop: ; Row 0 vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m3, m1 pmaddubsw m3, m0 pmaddwd m3, m2 vbroadcasti128 m4, [r0 + 4] pshufb m4, m1 pmaddubsw m4, m0 pmaddwd m4, m2 packssdw m3, m4 pmulhrsw m3, m6 vbroadcasti128 m4, [r0 + 16] pshufb m4, m1 pmaddubsw m4, m0 pmaddwd m4, m2 vbroadcasti128 m5, [r0 + 20] pshufb m5, m1 pmaddubsw m5, m0 pmaddwd m5, m2 packssdw m4, m5 pmulhrsw m4, m6 packuswb m3, m4 vpermq m3, m3, 11011000b movu [r2], m3 add r2, r3 add r0, r1 dec r4d jnz .loop RET %endmacro IPFILTER_CHROMA_PP_32xN_AVX2 32, 16 IPFILTER_CHROMA_PP_32xN_AVX2 32, 24 IPFILTER_CHROMA_PP_32xN_AVX2 32, 8 IPFILTER_CHROMA_PP_32xN_AVX2 32, 64 IPFILTER_CHROMA_PP_32xN_AVX2 32, 48 ;------------------------------------------------------------------------------------------------------------- ; void interp_4tap_horiz_pp_8xN(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx ;------------------------------------------------------------------------------------------------------------- %macro IPFILTER_CHROMA_PP_8xN_AVX2 2 INIT_YMM avx2 cglobal interp_4tap_horiz_pp_%1x%2, 4,6,6 mov r4d, r4m %ifdef PIC lea r5, [tab_ChromaCoeff] vpbroadcastd m0, [r5 + r4 * 4] %else vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] %endif movu m1, [tab_Tm] vpbroadcastd m2, [pw_1] ; register map ; m0 - interpolate coeff ; m1 - shuffle order table ; m2 - constant word 1 sub r0, 1 mov r4d, %2 .loop: sub r4d, 4 ; Row 0 vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m3, m1 pmaddubsw m3, m0 pmaddwd m3, m2 ; Row 1 vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m4, m1 pmaddubsw m4, m0 pmaddwd m4, m2 packssdw m3, m4 pmulhrsw m3, [pw_512] lea r0, [r0 + r1 * 2] ; Row 2 vbroadcasti128 m4, [r0 ] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m4, m1 pmaddubsw m4, m0 pmaddwd m4, m2 ; Row 3 vbroadcasti128 m5, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m5, m1 pmaddubsw m5, m0 pmaddwd m5, m2 packssdw m4, m5 pmulhrsw m4, [pw_512] packuswb m3, m4 mova m5, [interp_4tap_8x8_horiz_shuf] vpermd m3, m5, m3 vextracti128 xm4, m3, 1 movq [r2], xm3 movhps [r2 + r3], xm3 lea r2, [r2 + r3 * 2] movq [r2], xm4 movhps [r2 + r3], xm4 lea r2, [r2 + r3 * 2] lea r0, [r0 + r1*2] test r4d, r4d jnz .loop RET %endmacro IPFILTER_CHROMA_PP_8xN_AVX2 8 , 16 IPFILTER_CHROMA_PP_8xN_AVX2 8 , 32 IPFILTER_CHROMA_PP_8xN_AVX2 8 , 4 IPFILTER_CHROMA_PP_8xN_AVX2 8 , 64 IPFILTER_CHROMA_PP_8xN_AVX2 8 , 12 ;------------------------------------------------------------------------------------------------------------- ; void 
interp_4tap_horiz_pp_4xN(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx
;-------------------------------------------------------------------------------------------------------------
%macro IPFILTER_CHROMA_PP_4xN_AVX2 2
INIT_YMM avx2
cglobal interp_4tap_horiz_pp_%1x%2, 4,6,6
    mov             r4d, r4m
%ifdef PIC
    lea             r5, [tab_ChromaCoeff]
    vpbroadcastd    m0, [r5 + r4 * 4]
%else
    vpbroadcastd    m0, [tab_ChromaCoeff + r4 * 4]
%endif
    vpbroadcastd    m2, [pw_1]
    vbroadcasti128  m1, [tab_Tm]
    mov             r4d, %2
    ; register map
    ; m0 - interpolate coeff
    ; m1 - shuffle order table
    ; m2 - constant word 1
    dec             r0
.loop
    sub             r4d, 4
    ; Row 0-1
    movu            xm3, [r0]                   ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
    vinserti128     m3, m3, [r0 + r1], 1
    pshufb          m3, m1
    pmaddubsw       m3, m0
    pmaddwd         m3, m2
    ; Row 2-3
    lea             r0, [r0 + r1 * 2]
    movu            xm4, [r0]                   ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
    vinserti128     m4, m4, [r0 + r1], 1
    pshufb          m4, m1
    pmaddubsw       m4, m0
    pmaddwd         m4, m2
    packssdw        m3, m4
    pmulhrsw        m3, [pw_512]
    vextracti128    xm4, m3, 1
    packuswb        xm3, xm4
    movd            [r2], xm3
    pextrd          [r2+r3], xm3, 2
    lea             r2, [r2 + r3 * 2]
    pextrd          [r2], xm3, 1
    pextrd          [r2+r3], xm3, 3
    lea             r0, [r0 + r1 * 2]
    lea             r2, [r2 + r3 * 2]
    test            r4d, r4d
    jnz             .loop
    RET
%endmacro

IPFILTER_CHROMA_PP_4xN_AVX2 4 , 8
IPFILTER_CHROMA_PP_4xN_AVX2 4 , 16

%macro IPFILTER_LUMA_PS_32xN_AVX2 2
INIT_YMM avx2
cglobal interp_8tap_horiz_ps_%1x%2, 4, 7, 8
    mov             r5d, r5m
    mov             r4d, r4m
%ifdef PIC
    lea             r6, [tab_LumaCoeff]
    vpbroadcastq    m0, [r6 + r4 * 8]
%else
    vpbroadcastq    m0, [tab_LumaCoeff + r4 * 8]
%endif
    mova            m6, [tab_Lm + 32]
    mova            m1, [tab_Lm]
    mov             r4d, %2                     ;height
    add             r3d, r3d
    vbroadcasti128  m2, [pw_1]
    mova            m7, [interp8_hps_shuf]
    ; register map
    ; m0      - interpolate coeff
    ; m1 , m6 - shuffle order table
    ; m2      - pw_1
    sub             r0, 3
    test            r5d, r5d
    jz              .label
    lea             r6, [r1 * 3]                ; r6 = (N / 2 - 1) * srcStride
    sub             r0, r6
    add             r4d, 7
.label
    lea             r6, [pw_2000]
.loop
    ; Row 0
    vbroadcasti128  m3, [r0]                    ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
    pshufb          m4, m3, m6                  ; row 0 (col 4 to 7)
    pshufb          m3, m1                      ; shuffled based on the col order tab_Lm row 0 (col 0 to 3)
    pmaddubsw       m3, m0
    pmaddubsw       m4, m0
    pmaddwd         m3, m2
    pmaddwd         m4, m2
    packssdw        m3, m4
    vbroadcasti128  m4, [r0 + 8]
    pshufb          m5, m4, m6                  ;row 0 (col 12 to 15)
    pshufb          m4, m1                      ;row 0 (col 8 to 11)
    pmaddubsw       m4, m0
    pmaddubsw       m5, m0
    pmaddwd         m4, m2
    pmaddwd         m5, m2
    packssdw        m4, m5
    pmaddwd         m3, m2
    pmaddwd         m4, m2
    packssdw        m3, m4
    vpermd          m3, m7, m3
    psubw           m3, [r6]
    movu            [r2], m3                    ;row 0
    vbroadcasti128  m3, [r0 + 16]
    pshufb          m4, m3, m6                  ; row 0 (col 20 to 23)
    pshufb          m3, m1                      ; row 0 (col 16 to 19)
    pmaddubsw       m3, m0
    pmaddubsw       m4, m0
    pmaddwd         m3, m2
    pmaddwd         m4, m2
    packssdw        m3, m4
    vbroadcasti128  m4, [r0 + 24]
    pshufb          m5, m4, m6                  ;row 0 (col 28 to 31)
    pshufb          m4, m1                      ;row 0 (col 24 to 27)
    pmaddubsw       m4, m0
    pmaddubsw       m5, m0
    pmaddwd         m4, m2
    pmaddwd         m5, m2
    packssdw        m4, m5
    pmaddwd         m3, m2
    pmaddwd         m4, m2
    packssdw        m3, m4
    vpermd          m3, m7, m3
    psubw           m3, [r6]
    movu            [r2 + 32], m3               ;row 0
    add             r0, r1
    add             r2, r3
    dec             r4d
    jnz             .loop
    RET
%endmacro

IPFILTER_LUMA_PS_32xN_AVX2 32 , 32
IPFILTER_LUMA_PS_32xN_AVX2 32 , 16
IPFILTER_LUMA_PS_32xN_AVX2 32 , 24
IPFILTER_LUMA_PS_32xN_AVX2 32 , 8
IPFILTER_LUMA_PS_32xN_AVX2 32 , 64

INIT_YMM avx2
cglobal interp_8tap_horiz_ps_48x64, 4, 7, 8
    mov             r5d, r5m
    mov             r4d, r4m
%ifdef PIC
    lea             r6, [tab_LumaCoeff]
    vpbroadcastq    m0, [r6 + r4 * 8]
%else
    vpbroadcastq    m0, [tab_LumaCoeff + r4 * 8]
%endif
    mova            m6, [tab_Lm + 32]
    mova            m1, [tab_Lm]
    mov             r4d, 64                     ;height
    add             r3d, r3d
    vbroadcasti128  m2, [pw_2000]
    mova            m7, [pw_1]
    ; register map
    ; m0      - interpolate coeff
    ; m1 , m6 - shuffle order table
    ; m2      - pw_2000
    sub             r0, 3
    test            r5d, r5d
    jz              .label
    lea             r6, [r1
* 3] ; r6 = (N / 2 - 1) * srcStride sub r0, r6 ; r0(src)-r6 add r4d, 7 ; blkheight += N - 1 (7 - 1 = 6 ; since the last one row not in loop) .label lea r6, [interp8_hps_shuf] .loop ; Row 0 vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m4, m3, m6 ; row 0 (col 4 to 7) pshufb m3, m1 ; shuffled based on the col order tab_Lm row 0 (col 0 to 3) pmaddubsw m3, m0 pmaddubsw m4, m0 pmaddwd m3, m7 pmaddwd m4, m7 packssdw m3, m4 vbroadcasti128 m4, [r0 + 8] pshufb m5, m4, m6 ;row 0 (col 12 to 15) pshufb m4, m1 ;row 0 (col 8 to 11) pmaddubsw m4, m0 pmaddubsw m5, m0 pmaddwd m4, m7 pmaddwd m5, m7 packssdw m4, m5 pmaddwd m3, m7 pmaddwd m4, m7 packssdw m3, m4 mova m5, [r6] vpermd m3, m5, m3 psubw m3, m2 movu [r2], m3 ;row 0 vbroadcasti128 m3, [r0 + 16] pshufb m4, m3, m6 ; row 0 (col 20 to 23) pshufb m3, m1 ; row 0 (col 16 to 19) pmaddubsw m3, m0 pmaddubsw m4, m0 pmaddwd m3, m7 pmaddwd m4, m7 packssdw m3, m4 vbroadcasti128 m4, [r0 + 24] pshufb m5, m4, m6 ;row 0 (col 28 to 31) pshufb m4, m1 ;row 0 (col 24 to 27) pmaddubsw m4, m0 pmaddubsw m5, m0 pmaddwd m4, m7 pmaddwd m5, m7 packssdw m4, m5 pmaddwd m3, m7 pmaddwd m4, m7 packssdw m3, m4 mova m5, [r6] vpermd m3, m5, m3 psubw m3, m2 movu [r2 + 32], m3 ;row 0 vbroadcasti128 m3, [r0 + 32] pshufb m4, m3, m6 ; row 0 (col 36 to 39) pshufb m3, m1 ; row 0 (col 32 to 35) pmaddubsw m3, m0 pmaddubsw m4, m0 pmaddwd m3, m7 pmaddwd m4, m7 packssdw m3, m4 vbroadcasti128 m4, [r0 + 40] pshufb m5, m4, m6 ;row 0 (col 44 to 47) pshufb m4, m1 ;row 0 (col 40 to 43) pmaddubsw m4, m0 pmaddubsw m5, m0 pmaddwd m4, m7 pmaddwd m5, m7 packssdw m4, m5 pmaddwd m3, m7 pmaddwd m4, m7 packssdw m3, m4 mova m5, [r6] vpermd m3, m5, m3 psubw m3, m2 movu [r2 + 64], m3 ;row 0 add r0, r1 add r2, r3 dec r4d jnz .loop RET INIT_YMM avx2 cglobal interp_8tap_horiz_pp_24x32, 4,6,8 sub r0, 3 mov r4d, r4m %ifdef PIC lea r5, [tab_LumaCoeff] vpbroadcastd m0, [r5 + r4 * 8] vpbroadcastd m1, [r5 + r4 * 8 + 4] %else vpbroadcastd m0, [tab_LumaCoeff + r4 * 8] vpbroadcastd m1, [tab_LumaCoeff + r4 * 8 + 4] %endif movu m3, [tab_Tm + 16] vpbroadcastd m7, [pw_1] lea r5, [tab_Tm] ; register map ; m0 , m1 interpolate coeff ; m2 , m2 shuffle order table ; m7 - pw_1 mov r4d, 32 .loop: ; Row 0 vbroadcasti128 m4, [r0] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0] pshufb m5, m4, m3 pshufb m4, [r5] pmaddubsw m4, m0 pmaddubsw m5, m1 paddw m4, m5 pmaddwd m4, m7 vbroadcasti128 m5, [r0 + 8] pshufb m6, m5, m3 pshufb m5, [r5] pmaddubsw m5, m0 pmaddubsw m6, m1 paddw m5, m6 pmaddwd m5, m7 packssdw m4, m5 ; [17 16 15 14 07 06 05 04 13 12 11 10 03 02 01 00] pmulhrsw m4, [pw_512] vbroadcasti128 m2, [r0 + 16] pshufb m5, m2, m3 pshufb m2, [r5] pmaddubsw m2, m0 pmaddubsw m5, m1 paddw m2, m5 pmaddwd m2, m7 packssdw m2, m2 pmulhrsw m2, [pw_512] packuswb m4, m2 vpermq m4, m4, 11011000b vextracti128 xm5, m4, 1 pshufd xm4, xm4, 11011000b pshufd xm5, xm5, 11011000b movu [r2], xm4 movq [r2 + 16], xm5 add r0, r1 add r2, r3 dec r4d jnz .loop RET INIT_YMM avx2 cglobal interp_8tap_horiz_pp_12x16, 4,6,8 sub r0, 3 mov r4d, r4m %ifdef PIC lea r5, [tab_LumaCoeff] vpbroadcastd m0, [r5 + r4 * 8] vpbroadcastd m1, [r5 + r4 * 8 + 4] %else vpbroadcastd m0, [tab_LumaCoeff + r4 * 8] vpbroadcastd m1, [tab_LumaCoeff + r4 * 8 + 4] %endif movu m3, [tab_Tm + 16] vpbroadcastd m7, [pw_1] lea r5, [tab_Tm] ; register map ; m0 , m1 interpolate coeff ; m2 , m2 shuffle order table ; m7 - pw_1 mov r4d, 8 .loop: ; Row 0 vbroadcasti128 m4, [r0] ;first 8 element pshufb m5, m4, m3 pshufb m4, [r5] pmaddubsw m4, m0 pmaddubsw m5, m1 paddw m4, m5 pmaddwd m4, m7 vbroadcasti128 
m5, [r0 + 8] ; element 8 to 11 pshufb m6, m5, m3 pshufb m5, [r5] pmaddubsw m5, m0 pmaddubsw m6, m1 paddw m5, m6 pmaddwd m5, m7 packssdw m4, m5 ; [17 16 15 14 07 06 05 04 13 12 11 10 03 02 01 00] pmulhrsw m4, [pw_512] ;Row 1 vbroadcasti128 m2, [r0 + r1] pshufb m5, m2, m3 pshufb m2, [r5] pmaddubsw m2, m0 pmaddubsw m5, m1 paddw m2, m5 pmaddwd m2, m7 vbroadcasti128 m5, [r0 + r1 + 8] pshufb m6, m5, m3 pshufb m5, [r5] pmaddubsw m5, m0 pmaddubsw m6, m1 paddw m5, m6 pmaddwd m5, m7 packssdw m2, m5 pmulhrsw m2, [pw_512] packuswb m4, m2 vpermq m4, m4, 11011000b vextracti128 xm5, m4, 1 pshufd xm4, xm4, 11011000b pshufd xm5, xm5, 11011000b movq [r2], xm4 pextrd [r2+8], xm4, 2 movq [r2 + r3], xm5 pextrd [r2+r3+8], xm5, 2 lea r0, [r0 + r1 * 2] lea r2, [r2 + r3 * 2] dec r4d jnz .loop RET ;------------------------------------------------------------------------------------------------------------- ; void interp_4tap_horiz_pp_16xN(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx ;------------------------------------------------------------------------------------------------------------- %macro IPFILTER_CHROMA_PP_16xN_AVX2 2 INIT_YMM avx2 cglobal interp_4tap_horiz_pp_%1x%2, 4, 6, 7 mov r4d, r4m %ifdef PIC lea r5, [tab_ChromaCoeff] vpbroadcastd m0, [r5 + r4 * 4] %else vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] %endif mova m6, [pw_512] mova m1, [interp4_horiz_shuf1] vpbroadcastd m2, [pw_1] ; register map ; m0 - interpolate coeff ; m1 - shuffle order table ; m2 - constant word 1 dec r0 mov r4d, %2/2 .loop: ; Row 0 vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m3, m1 pmaddubsw m3, m0 pmaddwd m3, m2 vbroadcasti128 m4, [r0 + 4] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m4, m1 pmaddubsw m4, m0 pmaddwd m4, m2 packssdw m3, m4 pmulhrsw m3, m6 ; Row 1 vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m4, m1 pmaddubsw m4, m0 pmaddwd m4, m2 vbroadcasti128 m5, [r0 + r1 + 4] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m5, m1 pmaddubsw m5, m0 pmaddwd m5, m2 packssdw m4, m5 pmulhrsw m4, m6 packuswb m3, m4 vpermq m3, m3, 11011000b vextracti128 xm4, m3, 1 movu [r2], xm3 movu [r2 + r3], xm4 lea r2, [r2 + r3 * 2] lea r0, [r0 + r1 * 2] dec r4d jnz .loop RET %endmacro IPFILTER_CHROMA_PP_16xN_AVX2 16 , 8 IPFILTER_CHROMA_PP_16xN_AVX2 16 , 32 IPFILTER_CHROMA_PP_16xN_AVX2 16 , 12 IPFILTER_CHROMA_PP_16xN_AVX2 16 , 4 IPFILTER_CHROMA_PP_16xN_AVX2 16 , 64 IPFILTER_CHROMA_PP_16xN_AVX2 16 , 24 %macro IPFILTER_LUMA_PS_64xN_AVX2 1 INIT_YMM avx2 cglobal interp_8tap_horiz_ps_64x%1, 4, 7, 8 mov r5d, r5m mov r4d, r4m %ifdef PIC lea r6, [tab_LumaCoeff] vpbroadcastq m0, [r6 + r4 * 8] %else vpbroadcastq m0, [tab_LumaCoeff + r4 * 8] %endif mova m6, [tab_Lm + 32] mova m1, [tab_Lm] mov r4d, %1 ;height add r3d, r3d vbroadcasti128 m2, [pw_1] mova m7, [interp8_hps_shuf] ; register map ; m0 - interpolate coeff ; m1 , m6 - shuffle order table ; m2 - pw_2000 sub r0, 3 test r5d, r5d jz .label lea r6, [r1 * 3] sub r0, r6 ; r0(src)-r6 add r4d, 7 ; blkheight += N - 1 .label lea r6, [pw_2000] .loop ; Row 0 vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m4, m3, m6 ; row 0 (col 4 to 7) pshufb m3, m1 ; shuffled based on the col order tab_Lm row 0 (col 0 to 3) pmaddubsw m3, m0 pmaddubsw m4, m0 pmaddwd m3, m2 pmaddwd m4, m2 packssdw m3, m4 vbroadcasti128 m4, [r0 + 8] pshufb m5, m4, m6 ;row 0 (col 12 to 15) pshufb m4, m1 ;row 0 (col 8 to 11) pmaddubsw m4, m0 pmaddubsw m5, m0 pmaddwd m4, m2 pmaddwd m5, m2 packssdw m4, m5 pmaddwd m3, m2 pmaddwd m4, m2 packssdw m3, m4 vpermd m3, 
m7, m3 psubw m3, [r6] movu [r2], m3 ;row 0 vbroadcasti128 m3, [r0 + 16] pshufb m4, m3, m6 ; row 0 (col 20 to 23) pshufb m3, m1 ; row 0 (col 16 to 19) pmaddubsw m3, m0 pmaddubsw m4, m0 pmaddwd m3, m2 pmaddwd m4, m2 packssdw m3, m4 vbroadcasti128 m4, [r0 + 24] pshufb m5, m4, m6 ;row 0 (col 28 to 31) pshufb m4, m1 ;row 0 (col 24 to 27) pmaddubsw m4, m0 pmaddubsw m5, m0 pmaddwd m4, m2 pmaddwd m5, m2 packssdw m4, m5 pmaddwd m3, m2 pmaddwd m4, m2 packssdw m3, m4 vpermd m3, m7, m3 psubw m3, [r6] movu [r2 + 32], m3 ;row 0 vbroadcasti128 m3, [r0 + 32] pshufb m4, m3, m6 ; row 0 (col 36 to 39) pshufb m3, m1 ; row 0 (col 32 to 35) pmaddubsw m3, m0 pmaddubsw m4, m0 pmaddwd m3, m2 pmaddwd m4, m2 packssdw m3, m4 vbroadcasti128 m4, [r0 + 40] pshufb m5, m4, m6 ;row 0 (col 44 to 47) pshufb m4, m1 ;row 0 (col 40 to 43) pmaddubsw m4, m0 pmaddubsw m5, m0 pmaddwd m4, m2 pmaddwd m5, m2 packssdw m4, m5 pmaddwd m3, m2 pmaddwd m4, m2 packssdw m3, m4 vpermd m3, m7, m3 psubw m3, [r6] movu [r2 + 64], m3 ;row 0 vbroadcasti128 m3, [r0 + 48] pshufb m4, m3, m6 ; row 0 (col 52 to 55) pshufb m3, m1 ; row 0 (col 48 to 51) pmaddubsw m3, m0 pmaddubsw m4, m0 pmaddwd m3, m2 pmaddwd m4, m2 packssdw m3, m4 vbroadcasti128 m4, [r0 + 56] pshufb m5, m4, m6 ;row 0 (col 60 to 63) pshufb m4, m1 ;row 0 (col 56 to 59) pmaddubsw m4, m0 pmaddubsw m5, m0 pmaddwd m4, m2 pmaddwd m5, m2 packssdw m4, m5 pmaddwd m3, m2 pmaddwd m4, m2 packssdw m3, m4 vpermd m3, m7, m3 psubw m3, [r6] movu [r2 + 96], m3 ;row 0 add r0, r1 add r2, r3 dec r4d jnz .loop RET %endmacro IPFILTER_LUMA_PS_64xN_AVX2 64 IPFILTER_LUMA_PS_64xN_AVX2 48 IPFILTER_LUMA_PS_64xN_AVX2 32 IPFILTER_LUMA_PS_64xN_AVX2 16 ;----------------------------------------------------------------------------------------------------------------------------- ; void interp_4tap_horiz_ps_8xN(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) ;----------------------------------------------------------------------------------------------------------------------------- %macro IPFILTER_CHROMA_PS_8xN_AVX2 1 INIT_YMM avx2 cglobal interp_4tap_horiz_ps_8x%1, 4,7,6 mov r4d, r4m mov r5d, r5m add r3d, r3d %ifdef PIC lea r6, [tab_ChromaCoeff] vpbroadcastd m0, [r6 + r4 * 4] %else vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] %endif vbroadcasti128 m2, [pw_1] vbroadcasti128 m5, [pw_2000] mova m1, [tab_Tm] ; register map ; m0 - interpolate coeff ; m1 - shuffle order table ; m2 - constant word 1 mov r6d, %1/2 dec r0 test r5d, r5d jz .loop sub r0 , r1 inc r6d .loop ; Row 0 vbroadcasti128 m3, [r0] pshufb m3, m1 pmaddubsw m3, m0 pmaddwd m3, m2 ; Row 1 vbroadcasti128 m4, [r0 + r1] pshufb m4, m1 pmaddubsw m4, m0 pmaddwd m4, m2 packssdw m3, m4 psubw m3, m5 vpermq m3, m3, 11011000b vextracti128 xm4, m3, 1 movu [r2], xm3 movu [r2 + r3], xm4 lea r2, [r2 + r3 * 2] lea r0, [r0 + r1 * 2] dec r6d jnz .loop test r5d, r5d jz .end ;Row 11 vbroadcasti128 m3, [r0] pshufb m3, m1 pmaddubsw m3, m0 pmaddwd m3, m2 packssdw m3, m3 psubw m3, m5 vpermq m3, m3, 11011000b movu [r2], xm3 .end RET %endmacro IPFILTER_CHROMA_PS_8xN_AVX2 2 IPFILTER_CHROMA_PS_8xN_AVX2 32 IPFILTER_CHROMA_PS_8xN_AVX2 16 IPFILTER_CHROMA_PS_8xN_AVX2 6 IPFILTER_CHROMA_PS_8xN_AVX2 4 IPFILTER_CHROMA_PS_8xN_AVX2 12 IPFILTER_CHROMA_PS_8xN_AVX2 64 INIT_YMM avx2 cglobal interp_4tap_horiz_ps_2x4, 4, 7, 3 mov r4d, r4m mov r5d, r5m add r3d, r3d %ifdef PIC lea r6, [tab_ChromaCoeff] vpbroadcastd m0, [r6 + r4 * 4] %else vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] %endif mova xm3, [pw_2000] dec r0 test r5d, r5d jz .label sub r0, r1 .label lea r6, 
[r1 * 3] movq xm1, [r0] movhps xm1, [r0 + r1] movq xm2, [r0 + r1 * 2] movhps xm2, [r0 + r6] vinserti128 m1, m1, xm2, 1 pshufb m1, [interp4_hpp_shuf] pmaddubsw m1, m0 pmaddwd m1, [pw_1] vextracti128 xm2, m1, 1 packssdw xm1, xm2 psubw xm1, xm3 lea r4, [r3 * 3] movd [r2], xm1 pextrd [r2 + r3], xm1, 1 pextrd [r2 + r3 * 2], xm1, 2 pextrd [r2 + r4], xm1, 3 test r5d, r5d jz .end lea r2, [r2 + r3 * 4] lea r0, [r0 + r1 * 4] movq xm1, [r0] movhps xm1, [r0 + r1] movq xm2, [r0 + r1 * 2] vinserti128 m1, m1, xm2, 1 pshufb m1, [interp4_hpp_shuf] pmaddubsw m1, m0 pmaddwd m1, [pw_1] vextracti128 xm2, m1, 1 packssdw xm1, xm2 psubw xm1, xm3 movd [r2], xm1 pextrd [r2 + r3], xm1, 1 pextrd [r2 + r3 * 2], xm1, 2 .end RET INIT_YMM avx2 cglobal interp_4tap_horiz_ps_2x8, 4, 7, 7 mov r4d, r4m mov r5d, r5m add r3d, r3d %ifdef PIC lea r6, [tab_ChromaCoeff] vpbroadcastd m0, [r6 + r4 * 4] %else vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] %endif vbroadcasti128 m6, [pw_2000] test r5d, r5d jz .label sub r0, r1 .label mova m4, [interp4_hpp_shuf] mova m5, [pw_1] dec r0 lea r4, [r1 * 3] movq xm1, [r0] ;row 0 movhps xm1, [r0 + r1] movq xm2, [r0 + r1 * 2] movhps xm2, [r0 + r4] vinserti128 m1, m1, xm2, 1 lea r0, [r0 + r1 * 4] movq xm3, [r0] movhps xm3, [r0 + r1] movq xm2, [r0 + r1 * 2] movhps xm2, [r0 + r4] vinserti128 m3, m3, xm2, 1 pshufb m1, m4 pshufb m3, m4 pmaddubsw m1, m0 pmaddubsw m3, m0 pmaddwd m1, m5 pmaddwd m3, m5 packssdw m1, m3 psubw m1, m6 lea r4, [r3 * 3] vextracti128 xm2, m1, 1 movd [r2], xm1 pextrd [r2 + r3], xm1, 1 movd [r2 + r3 * 2], xm2 pextrd [r2 + r4], xm2, 1 lea r2, [r2 + r3 * 4] pextrd [r2], xm1, 2 pextrd [r2 + r3], xm1, 3 pextrd [r2 + r3 * 2], xm2, 2 pextrd [r2 + r4], xm2, 3 test r5d, r5d jz .end lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] movq xm1, [r0] ;row 0 movhps xm1, [r0 + r1] movq xm2, [r0 + r1 * 2] vinserti128 m1, m1, xm2, 1 pshufb m1, m4 pmaddubsw m1, m0 pmaddwd m1, m5 packssdw m1, m1 psubw m1, m6 vextracti128 xm2, m1, 1 movd [r2], xm1 pextrd [r2 + r3], xm1, 1 movd [r2 + r3 * 2], xm2 .end RET INIT_YMM avx2 cglobal interp_4tap_horiz_pp_12x16, 4, 6, 7 mov r4d, r4m %ifdef PIC lea r5, [tab_ChromaCoeff] vpbroadcastd m0, [r5 + r4 * 4] %else vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] %endif mova m6, [pw_512] mova m1, [interp4_horiz_shuf1] vpbroadcastd m2, [pw_1] ; register map ; m0 - interpolate coeff ; m1 - shuffle order table ; m2 - constant word 1 dec r0 mov r4d, 8 .loop: ; Row 0 vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m3, m1 pmaddubsw m3, m0 pmaddwd m3, m2 vbroadcasti128 m4, [r0 + 4] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m4, m1 pmaddubsw m4, m0 pmaddwd m4, m2 packssdw m3, m4 pmulhrsw m3, m6 ; Row 1 vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m4, m1 pmaddubsw m4, m0 pmaddwd m4, m2 vbroadcasti128 m5, [r0 + r1 + 4] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m5, m1 pmaddubsw m5, m0 pmaddwd m5, m2 packssdw m4, m5 pmulhrsw m4, m6 packuswb m3, m4 vpermq m3, m3, 11011000b vextracti128 xm4, m3, 1 movq [r2], xm3 pextrd [r2+8], xm3, 2 movq [r2 + r3], xm4 pextrd [r2 + r3 + 8],xm4, 2 lea r2, [r2 + r3 * 2] lea r0, [r0 + r1 * 2] dec r4d jnz .loop RET INIT_YMM avx2 cglobal interp_4tap_horiz_pp_24x32, 4,6,7 mov r4d, r4m %ifdef PIC lea r5, [tab_ChromaCoeff] vpbroadcastd m0, [r5 + r4 * 4] %else vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] %endif mova m1, [interp4_horiz_shuf1] vpbroadcastd m2, [pw_1] mova m6, [pw_512] ; register map ; m0 - interpolate coeff ; m1 - shuffle order table ; m2 - constant word 1 dec r0 mov r4d, 32 .loop: ; Row 0 vbroadcasti128 m3, 
[r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m3, m1 pmaddubsw m3, m0 pmaddwd m3, m2 vbroadcasti128 m4, [r0 + 4] pshufb m4, m1 pmaddubsw m4, m0 pmaddwd m4, m2 packssdw m3, m4 pmulhrsw m3, m6 vbroadcasti128 m4, [r0 + 16] pshufb m4, m1 pmaddubsw m4, m0 pmaddwd m4, m2 vbroadcasti128 m5, [r0 + 20] pshufb m5, m1 pmaddubsw m5, m0 pmaddwd m5, m2 packssdw m4, m5 pmulhrsw m4, m6 packuswb m3, m4 vpermq m3, m3, 11011000b vextracti128 xm4, m3, 1 movu [r2], xm3 movq [r2 + 16], xm4 add r2, r3 add r0, r1 dec r4d jnz .loop RET ;----------------------------------------------------------------------------------------------------------------------------- ; void interp_4tap_horiz_ps_6x8(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) ;-----------------------------------------------------------------------------------------------------------------------------; INIT_YMM avx2 cglobal interp_4tap_horiz_ps_6x8, 4,7,6 mov r4d, r4m mov r5d, r5m add r3d, r3d %ifdef PIC lea r6, [tab_ChromaCoeff] vpbroadcastd m0, [r6 + r4 * 4] %else vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] %endif vbroadcasti128 m2, [pw_1] vbroadcasti128 m5, [pw_2000] mova m1, [tab_Tm] ; register map ; m0 - interpolate coeff ; m1 - shuffle order table ; m2 - constant word 1 mov r6d, 8/2 dec r0 test r5d, r5d jz .loop sub r0 , r1 inc r6d .loop ; Row 0 vbroadcasti128 m3, [r0] pshufb m3, m1 pmaddubsw m3, m0 pmaddwd m3, m2 ; Row 1 vbroadcasti128 m4, [r0 + r1] pshufb m4, m1 pmaddubsw m4, m0 pmaddwd m4, m2 packssdw m3, m4 psubw m3, m5 vpermq m3, m3, 11011000b vextracti128 xm4, m3, 1 movq [r2], xm3 pextrd [r2 + 8], xm3, 2 movq [r2 + r3], xm4 pextrd [r2 + r3 + 8], xm4, 2 lea r2, [r2 + r3 * 2] lea r0, [r0 + r1 * 2] dec r6d jnz .loop test r5d, r5d jz .end ;Row 11 vbroadcasti128 m3, [r0] pshufb m3, m1 pmaddubsw m3, m0 pmaddwd m3, m2 packssdw m3, m3 psubw m3, m5 vextracti128 xm4, m3, 1 movq [r2], xm3 movd [r2+8], xm4 .end RET INIT_YMM avx2 cglobal interp_8tap_horiz_ps_12x16, 6, 7, 8 mov r5d, r5m mov r4d, r4m %ifdef PIC lea r6, [tab_LumaCoeff] vpbroadcastq m0, [r6 + r4 * 8] %else vpbroadcastq m0, [tab_LumaCoeff + r4 * 8] %endif mova m6, [tab_Lm + 32] mova m1, [tab_Lm] add r3d, r3d vbroadcasti128 m2, [pw_2000] mov r4d, 16 vbroadcasti128 m7, [pw_1] ; register map ; m0 - interpolate coeff ; m1 - shuffle order table ; m2 - pw_2000 mova m5, [interp8_hps_shuf] sub r0, 3 test r5d, r5d jz .loop lea r6, [r1 * 3] ; r6 = (N / 2 - 1) * srcStride sub r0, r6 ; r0(src)-r6 add r4d, 7 .loop ; Row 0 vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m4, m3, m6 pshufb m3, m1 ; shuffled based on the col order tab_Lm pmaddubsw m3, m0 pmaddubsw m4, m0 pmaddwd m3, m7 pmaddwd m4, m7 packssdw m3, m4 vbroadcasti128 m4, [r0 + 8] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m4, m1 pmaddubsw m4, m0 pmaddwd m4, m7 packssdw m4, m4 pmaddwd m3, m7 pmaddwd m4, m7 packssdw m3, m4 vpermd m3, m5, m3 psubw m3, m2 vextracti128 xm4, m3, 1 movu [r2], xm3 ;row 0 movq [r2 + 16], xm4 ;row 1 add r0, r1 add r2, r3 dec r4d jnz .loop RET INIT_YMM avx2 cglobal interp_8tap_horiz_ps_24x32, 4, 7, 8 mov r5d, r5m mov r4d, r4m %ifdef PIC lea r6, [tab_LumaCoeff] vpbroadcastq m0, [r6 + r4 * 8] %else vpbroadcastq m0, [tab_LumaCoeff + r4 * 8] %endif mova m6, [tab_Lm + 32] mova m1, [tab_Lm] mov r4d, 32 ;height add r3d, r3d vbroadcasti128 m2, [pw_2000] vbroadcasti128 m7, [pw_1] ; register map ; m0 - interpolate coeff ; m1 , m6 - shuffle order table ; m2 - pw_2000 sub r0, 3 test r5d, r5d jz .label lea r6, [r1 * 3] ; r6 = (N / 2 - 1) * srcStride sub r0, r6 ; 
r0(src)-r6 add r4d, 7 ; blkheight += N - 1 (7 - 1 = 6 ; since the last one row not in loop) .label lea r6, [interp8_hps_shuf] .loop ; Row 0 vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m4, m3, m6 ; row 0 (col 4 to 7) pshufb m3, m1 ; shuffled based on the col order tab_Lm row 0 (col 0 to 3) pmaddubsw m3, m0 pmaddubsw m4, m0 pmaddwd m3, m7 pmaddwd m4, m7 packssdw m3, m4 vbroadcasti128 m4, [r0 + 8] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m5, m4, m6 ;row 1 (col 4 to 7) pshufb m4, m1 ;row 1 (col 0 to 3) pmaddubsw m4, m0 pmaddubsw m5, m0 pmaddwd m4, m7 pmaddwd m5, m7 packssdw m4, m5 pmaddwd m3, m7 pmaddwd m4, m7 packssdw m3, m4 mova m5, [r6] vpermd m3, m5, m3 psubw m3, m2 movu [r2], m3 ;row 0 vbroadcasti128 m3, [r0 + 16] pshufb m4, m3, m6 pshufb m3, m1 pmaddubsw m3, m0 pmaddubsw m4, m0 pmaddwd m3, m7 pmaddwd m4, m7 packssdw m3, m4 pmaddwd m3, m7 pmaddwd m4, m7 packssdw m3, m4 mova m4, [r6] vpermd m3, m4, m3 psubw m3, m2 movu [r2 + 32], xm3 ;row 0 add r0, r1 add r2, r3 dec r4d jnz .loop RET ;----------------------------------------------------------------------------------------------------------------------------- ; void interp_4tap_horiz_ps_24x32(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) ;----------------------------------------------------------------------------------------------------------------------------- INIT_YMM avx2 cglobal interp_4tap_horiz_ps_24x32, 4,7,6 mov r4d, r4m mov r5d, r5m add r3d, r3d %ifdef PIC lea r6, [tab_ChromaCoeff] vpbroadcastd m0, [r6 + r4 * 4] %else vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] %endif vbroadcasti128 m2, [pw_1] vbroadcasti128 m5, [pw_2000] mova m1, [tab_Tm] ; register map ; m0 - interpolate coeff ; m1 - shuffle order table ; m2 - constant word 1 mov r6d, 32 dec r0 test r5d, r5d je .loop sub r0 , r1 add r6d , 3 .loop ; Row 0 vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m3, m1 pmaddubsw m3, m0 pmaddwd m3, m2 vbroadcasti128 m4, [r0 + 8] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m4, m1 pmaddubsw m4, m0 pmaddwd m4, m2 packssdw m3, m4 psubw m3, m5 vpermq m3, m3, 11011000b movu [r2], m3 vbroadcasti128 m3, [r0 + 16] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m3, m1 pmaddubsw m3, m0 pmaddwd m3, m2 packssdw m3, m3 psubw m3, m5 vpermq m3, m3, 11011000b movu [r2 + 32], xm3 add r2, r3 add r0, r1 dec r6d jnz .loop RET ;----------------------------------------------------------------------------------------------------------------------- ;macro FILTER_H8_W8_16N_AVX2 ;----------------------------------------------------------------------------------------------------------------------- %macro FILTER_H8_W8_16N_AVX2 0 vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m4, m3, m6 ; row 0 (col 4 to 7) pshufb m3, m1 ; shuffled based on the col order tab_Lm row 0 (col 0 to 3) pmaddubsw m3, m0 pmaddubsw m4, m0 pmaddwd m3, m2 pmaddwd m4, m2 packssdw m3, m4 ; DWORD [R1D R1C R0D R0C R1B R1A R0B R0A] vbroadcasti128 m4, [r0 + 8] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m5, m4, m6 ;row 1 (col 4 to 7) pshufb m4, m1 ;row 1 (col 0 to 3) pmaddubsw m4, m0 pmaddubsw m5, m0 pmaddwd m4, m2 pmaddwd m5, m2 packssdw m4, m5 ; DWORD [R3D R3C R2D R2C R3B R3A R2B R2A] pmaddwd m3, m2 pmaddwd m4, m2 packssdw m3, m4 ; all rows and col completed. 
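    ; put the packed dwords back in row-major order via interp8_hps_shuf,
    ; subtract the pw_2000 bias, and spill the 16 intermediate results to the
    ; stack buffer cursor in r4 for the vertical pass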
mova m5, [interp8_hps_shuf] vpermd m3, m5, m3 psubw m3, m8 vextracti128 xm4, m3, 1 mova [r4], xm3 mova [r4 + 16], xm4 %endmacro ;----------------------------------------------------------------------------- ; void interp_8tap_hv_pp_16x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int idxX, int idxY) ;----------------------------------------------------------------------------- INIT_YMM avx2 %if ARCH_X86_64 == 1 cglobal interp_8tap_hv_pp_16x16, 4, 10, 15, 0-31*32 %define stk_buf1 rsp mov r4d, r4m mov r5d, r5m %ifdef PIC lea r6, [tab_LumaCoeff] vpbroadcastq m0, [r6 + r4 * 8] %else vpbroadcastq m0, [tab_LumaCoeff + r4 * 8] %endif xor r6, r6 mov r4, rsp mova m6, [tab_Lm + 32] mova m1, [tab_Lm] mov r8, 16 ;height vbroadcasti128 m8, [pw_2000] vbroadcasti128 m2, [pw_1] sub r0, 3 lea r7, [r1 * 3] ; r7 = (N / 2 - 1) * srcStride sub r0, r7 ; r0(src)-r7 add r8, 7 .loopH: FILTER_H8_W8_16N_AVX2 add r0, r1 add r4, 32 inc r6 cmp r6, 16+7 jnz .loopH ; vertical phase xor r6, r6 xor r1, r1 .loopV: ;load necessary variables mov r4d, r5d ;coeff here for vertical is r5m shl r4d, 7 mov r1d, 16 add r1d, r1d ; load intermedia buffer mov r0, stk_buf1 ; register mapping ; r0 - src ; r5 - coeff ; r6 - loop_i ; load coeff table %ifdef PIC lea r5, [pw_LumaCoeffVer] add r5, r4 %else lea r5, [pw_LumaCoeffVer + r4] %endif lea r4, [r1*3] mova m14, [pd_526336] lea r6, [r3 * 3] mov r9d, 16 / 8 .loopW: PROCESS_LUMA_AVX2_W8_16R sp add r2, 8 add r0, 16 dec r9d jnz .loopW RET %endif INIT_YMM avx2 cglobal interp_4tap_horiz_pp_12x32, 4, 6, 7 mov r4d, r4m %ifdef PIC lea r5, [tab_ChromaCoeff] vpbroadcastd m0, [r5 + r4 * 4] %else vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] %endif mova m6, [pw_512] mova m1, [interp4_horiz_shuf1] vpbroadcastd m2, [pw_1] ; register map ; m0 - interpolate coeff ; m1 - shuffle order table ; m2 - constant word 1 dec r0 mov r4d, 16 .loop: ; Row 0 vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m3, m1 pmaddubsw m3, m0 pmaddwd m3, m2 vbroadcasti128 m4, [r0 + 4] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m4, m1 pmaddubsw m4, m0 pmaddwd m4, m2 packssdw m3, m4 pmulhrsw m3, m6 ; Row 1 vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m4, m1 pmaddubsw m4, m0 pmaddwd m4, m2 vbroadcasti128 m5, [r0 + r1 + 4] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m5, m1 pmaddubsw m5, m0 pmaddwd m5, m2 packssdw m4, m5 pmulhrsw m4, m6 packuswb m3, m4 vpermq m3, m3, 11011000b vextracti128 xm4, m3, 1 movq [r2], xm3 pextrd [r2+8], xm3, 2 movq [r2 + r3], xm4 pextrd [r2 + r3 + 8],xm4, 2 lea r2, [r2 + r3 * 2] lea r0, [r0 + r1 * 2] dec r4d jnz .loop RET INIT_YMM avx2 cglobal interp_4tap_horiz_pp_24x64, 4,6,7 mov r4d, r4m %ifdef PIC lea r5, [tab_ChromaCoeff] vpbroadcastd m0, [r5 + r4 * 4] %else vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] %endif mova m1, [interp4_horiz_shuf1] vpbroadcastd m2, [pw_1] mova m6, [pw_512] ; register map ; m0 - interpolate coeff ; m1 - shuffle order table ; m2 - constant word 1 dec r0 mov r4d, 64 .loop: ; Row 0 vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m3, m1 pmaddubsw m3, m0 pmaddwd m3, m2 vbroadcasti128 m4, [r0 + 4] pshufb m4, m1 pmaddubsw m4, m0 pmaddwd m4, m2 packssdw m3, m4 pmulhrsw m3, m6 vbroadcasti128 m4, [r0 + 16] pshufb m4, m1 pmaddubsw m4, m0 pmaddwd m4, m2 vbroadcasti128 m5, [r0 + 20] pshufb m5, m1 pmaddubsw m5, m0 pmaddwd m5, m2 packssdw m4, m5 pmulhrsw m4, m6 packuswb m3, m4 vpermq m3, m3, 11011000b vextracti128 xm4, m3, 1 movu [r2], xm3 movq [r2 + 16], xm4 add r2, r3 add r0, r1 dec r4d jnz .loop RET INIT_YMM 
avx2 cglobal interp_4tap_horiz_pp_2x16, 4, 6, 6 mov r4d, r4m %ifdef PIC lea r5, [tab_ChromaCoeff] vpbroadcastd m0, [r5 + r4 * 4] %else vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] %endif mova m4, [interp4_hpp_shuf] mova m5, [pw_1] dec r0 lea r4, [r1 * 3] movq xm1, [r0] movhps xm1, [r0 + r1] movq xm2, [r0 + r1 * 2] movhps xm2, [r0 + r4] vinserti128 m1, m1, xm2, 1 lea r0, [r0 + r1 * 4] movq xm3, [r0] movhps xm3, [r0 + r1] movq xm2, [r0 + r1 * 2] movhps xm2, [r0 + r4] vinserti128 m3, m3, xm2, 1 pshufb m1, m4 pshufb m3, m4 pmaddubsw m1, m0 pmaddubsw m3, m0 pmaddwd m1, m5 pmaddwd m3, m5 packssdw m1, m3 pmulhrsw m1, [pw_512] vextracti128 xm2, m1, 1 packuswb xm1, xm2 lea r4, [r3 * 3] pextrw [r2], xm1, 0 pextrw [r2 + r3], xm1, 1 pextrw [r2 + r3 * 2], xm1, 4 pextrw [r2 + r4], xm1, 5 lea r2, [r2 + r3 * 4] pextrw [r2], xm1, 2 pextrw [r2 + r3], xm1, 3 pextrw [r2 + r3 * 2], xm1, 6 pextrw [r2 + r4], xm1, 7 lea r2, [r2 + r3 * 4] lea r0, [r0 + r1 * 4] lea r4, [r1 * 3] movq xm1, [r0] movhps xm1, [r0 + r1] movq xm2, [r0 + r1 * 2] movhps xm2, [r0 + r4] vinserti128 m1, m1, xm2, 1 lea r0, [r0 + r1 * 4] movq xm3, [r0] movhps xm3, [r0 + r1] movq xm2, [r0 + r1 * 2] movhps xm2, [r0 + r4] vinserti128 m3, m3, xm2, 1 pshufb m1, m4 pshufb m3, m4 pmaddubsw m1, m0 pmaddubsw m3, m0 pmaddwd m1, m5 pmaddwd m3, m5 packssdw m1, m3 pmulhrsw m1, [pw_512] vextracti128 xm2, m1, 1 packuswb xm1, xm2 lea r4, [r3 * 3] pextrw [r2], xm1, 0 pextrw [r2 + r3], xm1, 1 pextrw [r2 + r3 * 2], xm1, 4 pextrw [r2 + r4], xm1, 5 lea r2, [r2 + r3 * 4] pextrw [r2], xm1, 2 pextrw [r2 + r3], xm1, 3 pextrw [r2 + r3 * 2], xm1, 6 pextrw [r2 + r4], xm1, 7 RET ;------------------------------------------------------------------------------------------------------------- ; void interp_4tap_horiz_pp_64xN(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx ;------------------------------------------------------------------------------------------------------------- %macro IPFILTER_CHROMA_PP_64xN_AVX2 1 INIT_YMM avx2 cglobal interp_4tap_horiz_pp_64x%1, 4,6,7 mov r4d, r4m %ifdef PIC lea r5, [tab_ChromaCoeff] vpbroadcastd m0, [r5 + r4 * 4] %else vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] %endif mova m1, [interp4_horiz_shuf1] vpbroadcastd m2, [pw_1] mova m6, [pw_512] ; register map ; m0 - interpolate coeff ; m1 - shuffle order table ; m2 - constant word 1 dec r0 mov r4d, %1 .loop: ; Row 0 vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m3, m1 pmaddubsw m3, m0 pmaddwd m3, m2 vbroadcasti128 m4, [r0 + 4] pshufb m4, m1 pmaddubsw m4, m0 pmaddwd m4, m2 packssdw m3, m4 pmulhrsw m3, m6 vbroadcasti128 m4, [r0 + 16] pshufb m4, m1 pmaddubsw m4, m0 pmaddwd m4, m2 vbroadcasti128 m5, [r0 + 20] pshufb m5, m1 pmaddubsw m5, m0 pmaddwd m5, m2 packssdw m4, m5 pmulhrsw m4, m6 packuswb m3, m4 vpermq m3, m3, 11011000b movu [r2], m3 vbroadcasti128 m3, [r0 + 32] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m3, m1 pmaddubsw m3, m0 pmaddwd m3, m2 vbroadcasti128 m4, [r0 + 36] pshufb m4, m1 pmaddubsw m4, m0 pmaddwd m4, m2 packssdw m3, m4 pmulhrsw m3, m6 vbroadcasti128 m4, [r0 + 48] pshufb m4, m1 pmaddubsw m4, m0 pmaddwd m4, m2 vbroadcasti128 m5, [r0 + 52] pshufb m5, m1 pmaddubsw m5, m0 pmaddwd m5, m2 packssdw m4, m5 pmulhrsw m4, m6 packuswb m3, m4 vpermq m3, m3, 11011000b movu [r2 + 32], m3 add r2, r3 add r0, r1 dec r4d jnz .loop RET %endmacro IPFILTER_CHROMA_PP_64xN_AVX2 64 IPFILTER_CHROMA_PP_64xN_AVX2 32 IPFILTER_CHROMA_PP_64xN_AVX2 48 IPFILTER_CHROMA_PP_64xN_AVX2 16 
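; A minimal C sketch of the model the interp_4tap_horiz_pp kernels above
; appear to implement (coeff[] stands for the 4-tap tab_ChromaCoeff row
; selected by coeffIdx and clip_pixel() for saturation to [0, 255]; neither
; name is defined in this file):
;
;     for (int row = 0; row < height; row++) {
;         for (int col = 0; col < width; col++) {
;             int sum = 0;
;             for (int k = 0; k < 4; k++)
;                 sum += src[col + k - 1] * coeff[k];    // "dec r0" centres the taps
;             dst[col] = clip_pixel((sum + 32) >> 6);    // what pmulhrsw with pw_512 computes
;         }
;         src += srcStride;
;         dst += dstStride;
;     }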
;-------------------------------------------------------------------------------------------------------------
; void interp_4tap_horiz_pp_48x64(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
INIT_YMM avx2
cglobal interp_4tap_horiz_pp_48x64, 4,6,7
    mov            r4d, r4m
%ifdef PIC
    lea            r5, [tab_ChromaCoeff]
    vpbroadcastd   m0, [r5 + r4 * 4]
%else
    vpbroadcastd   m0, [tab_ChromaCoeff + r4 * 4]
%endif
    mova           m1, [interp4_horiz_shuf1]
    vpbroadcastd   m2, [pw_1]
    mova           m6, [pw_512]

    ; register map
    ; m0 - interpolate coeff
    ; m1 - shuffle order table
    ; m2 - constant word 1

    dec            r0
    mov            r4d, 64

.loop:
    ; Row 0
    vbroadcasti128 m3, [r0]                ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
    pshufb         m3, m1
    pmaddubsw      m3, m0
    pmaddwd        m3, m2
    vbroadcasti128 m4, [r0 + 4]
    pshufb         m4, m1
    pmaddubsw      m4, m0
    pmaddwd        m4, m2
    packssdw       m3, m4
    pmulhrsw       m3, m6

    vbroadcasti128 m4, [r0 + 16]
    pshufb         m4, m1
    pmaddubsw      m4, m0
    pmaddwd        m4, m2
    vbroadcasti128 m5, [r0 + 20]
    pshufb         m5, m1
    pmaddubsw      m5, m0
    pmaddwd        m5, m2
    packssdw       m4, m5
    pmulhrsw       m4, m6
    packuswb       m3, m4
    vpermq         m3, m3, q3120
    movu           [r2], m3

    vbroadcasti128 m3, [r0 + mmsize]       ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
    pshufb         m3, m1
    pmaddubsw      m3, m0
    pmaddwd        m3, m2
    vbroadcasti128 m4, [r0 + mmsize + 4]
    pshufb         m4, m1
    pmaddubsw      m4, m0
    pmaddwd        m4, m2
    packssdw       m3, m4
    pmulhrsw       m3, m6

    vbroadcasti128 m4, [r0 + mmsize + 16]
    pshufb         m4, m1
    pmaddubsw      m4, m0
    pmaddwd        m4, m2
    vbroadcasti128 m5, [r0 + mmsize + 20]
    pshufb         m5, m1
    pmaddubsw      m5, m0
    pmaddwd        m5, m2
    packssdw       m4, m5
    pmulhrsw       m4, m6
    packuswb       m3, m4
    vpermq         m3, m3, q3120
    movu           [r2 + mmsize], xm3

    add            r2, r3
    add            r0, r1
    dec            r4d
    jnz            .loop
    RET

;-----------------------------------------------------------------------------------------------------------------------------
; void interp_4tap_horiz_ps_48x64(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
;-----------------------------------------------------------------------------------------------------------------------------
INIT_YMM avx2
cglobal interp_4tap_horiz_ps_48x64, 4,7,6
    mov            r4d, r4m
    mov            r5d, r5m
    add            r3d, r3d
%ifdef PIC
    lea            r6, [tab_ChromaCoeff]
    vpbroadcastd   m0, [r6 + r4 * 4]
%else
    vpbroadcastd   m0, [tab_ChromaCoeff + r4 * 4]
%endif
    vbroadcasti128 m2, [pw_1]
    vbroadcasti128 m5, [pw_2000]
    mova           m1, [tab_Tm]

    ; register map
    ; m0 - interpolate coeff
    ; m1 - shuffle order table
    ; m2 - constant word 1

    mov            r6d, 64
    dec            r0
    test           r5d, r5d
    je             .loop
    sub            r0, r1
    add            r6d, 3

.loop:
    ; Row 0
    vbroadcasti128 m3, [r0]                ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
    pshufb         m3, m1
    pmaddubsw      m3, m0
    pmaddwd        m3, m2
    vbroadcasti128 m4, [r0 + 8]            ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
    pshufb         m4, m1
    pmaddubsw      m4, m0
    pmaddwd        m4, m2
    packssdw       m3, m4
    psubw          m3, m5
    vpermq         m3, m3, q3120
    movu           [r2], m3

    vbroadcasti128 m3, [r0 + 16]           ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
    pshufb         m3, m1
    pmaddubsw      m3, m0
    pmaddwd        m3, m2
    vbroadcasti128 m4, [r0 + 24]           ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
    pshufb         m4, m1
    pmaddubsw      m4, m0
    pmaddwd        m4, m2
    packssdw       m3, m4
    psubw          m3, m5
    vpermq         m3, m3, q3120
    movu           [r2 + 32], m3

    vbroadcasti128 m3, [r0 + 32]           ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
    pshufb         m3, m1
    pmaddubsw      m3, m0
    pmaddwd        m3, m2
    vbroadcasti128 m4, [r0 + 40]           ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
    pshufb         m4, m1
    pmaddubsw      m4, m0
    pmaddwd        m4, m2
    packssdw       m3, m4
    psubw          m3, m5
    vpermq         m3, m3, q3120
    movu           [r2 + 64], m3

    add            r2, r3
    add            r0, r1
    dec            r6d
    jnz            .loop
    RET
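; The _ps variants above keep the filtered sums at 16-bit intermediate
; precision instead of rounding back to pixels - roughly, per output:
;
;     dst[col] = (int16_t)(sum - 8192);    // psubw with pw_2000 (0x2000)
;
; and when isRowExt is non-zero the prologue ("sub r0, r1" / "add r6d, 3")
; starts one row above src and emits height + 3 rows, which appears intended
; to cover the N - 1 extra rows a following 4-tap vertical pass consumes.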
;----------------------------------------------------------------------------------------------------------------------------- ; void interp_4tap_horiz_ps_24x64(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) ;----------------------------------------------------------------------------------------------------------------------------- INIT_YMM avx2 cglobal interp_4tap_horiz_ps_24x64, 4,7,6 mov r4d, r4m mov r5d, r5m add r3d, r3d %ifdef PIC lea r6, [tab_ChromaCoeff] vpbroadcastd m0, [r6 + r4 * 4] %else vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] %endif vbroadcasti128 m2, [pw_1] vbroadcasti128 m5, [pw_2000] mova m1, [tab_Tm] ; register map ; m0 - interpolate coeff ; m1 - shuffle order table ; m2 - constant word 1 mov r6d, 64 dec r0 test r5d, r5d je .loop sub r0 , r1 add r6d , 3 .loop ; Row 0 vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m3, m1 pmaddubsw m3, m0 pmaddwd m3, m2 vbroadcasti128 m4, [r0 + 8] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m4, m1 pmaddubsw m4, m0 pmaddwd m4, m2 packssdw m3, m4 psubw m3, m5 vpermq m3, m3, q3120 movu [r2], m3 vbroadcasti128 m3, [r0 + 16] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m3, m1 pmaddubsw m3, m0 pmaddwd m3, m2 packssdw m3, m3 psubw m3, m5 vpermq m3, m3, q3120 movu [r2 + 32], xm3 add r2, r3 add r0, r1 dec r6d jnz .loop RET INIT_YMM avx2 cglobal interp_4tap_horiz_ps_2x16, 4, 7, 7 mov r4d, r4m mov r5d, r5m add r3d, r3d %ifdef PIC lea r6, [tab_ChromaCoeff] vpbroadcastd m0, [r6 + r4 * 4] %else vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] %endif vbroadcasti128 m6, [pw_2000] test r5d, r5d jz .label sub r0, r1 .label mova m4, [interp4_hps_shuf] mova m5, [pw_1] dec r0 lea r4, [r1 * 3] movq xm1, [r0] ;row 0 movhps xm1, [r0 + r1] movq xm2, [r0 + r1 * 2] movhps xm2, [r0 + r4] vinserti128 m1, m1, xm2, 1 lea r0, [r0 + r1 * 4] movq xm3, [r0] movhps xm3, [r0 + r1] movq xm2, [r0 + r1 * 2] movhps xm2, [r0 + r4] vinserti128 m3, m3, xm2, 1 pshufb m1, m4 pshufb m3, m4 pmaddubsw m1, m0 pmaddubsw m3, m0 pmaddwd m1, m5 pmaddwd m3, m5 packssdw m1, m3 psubw m1, m6 lea r4, [r3 * 3] vextracti128 xm2, m1, 1 movd [r2], xm1 pextrd [r2 + r3], xm1, 1 movd [r2 + r3 * 2], xm2 pextrd [r2 + r4], xm2, 1 lea r2, [r2 + r3 * 4] pextrd [r2], xm1, 2 pextrd [r2 + r3], xm1, 3 pextrd [r2 + r3 * 2], xm2, 2 pextrd [r2 + r4], xm2, 3 lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] lea r4, [r1 * 3] movq xm1, [r0] movhps xm1, [r0 + r1] movq xm2, [r0 + r1 * 2] movhps xm2, [r0 + r4] vinserti128 m1, m1, xm2, 1 lea r0, [r0 + r1 * 4] movq xm3, [r0] movhps xm3, [r0 + r1] movq xm2, [r0 + r1 * 2] movhps xm2, [r0 + r4] vinserti128 m3, m3, xm2, 1 pshufb m1, m4 pshufb m3, m4 pmaddubsw m1, m0 pmaddubsw m3, m0 pmaddwd m1, m5 pmaddwd m3, m5 packssdw m1, m3 psubw m1, m6 lea r4, [r3 * 3] vextracti128 xm2, m1, 1 movd [r2], xm1 pextrd [r2 + r3], xm1, 1 movd [r2 + r3 * 2], xm2 pextrd [r2 + r4], xm2, 1 lea r2, [r2 + r3 * 4] pextrd [r2], xm1, 2 pextrd [r2 + r3], xm1, 3 pextrd [r2 + r3 * 2], xm2, 2 pextrd [r2 + r4], xm2, 3 test r5d, r5d jz .end lea r0, [r0 + r1 * 4] lea r2, [r2 + r3 * 4] movq xm1, [r0] movhps xm1, [r0 + r1] movq xm2, [r0 + r1 * 2] vinserti128 m1, m1, xm2, 1 pshufb m1, m4 pmaddubsw m1, m0 pmaddwd m1, m5 packssdw m1, m1 psubw m1, m6 vextracti128 xm2, m1, 1 movd [r2], xm1 pextrd [r2 + r3], xm1, 1 movd [r2 + r3 * 2], xm2 .end RET INIT_YMM avx2 cglobal interp_4tap_horiz_pp_6x16, 4, 6, 7 mov r4d, r4m %ifdef PIC lea r5, [tab_ChromaCoeff] vpbroadcastd m0, [r5 + r4 * 4] %else vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] %endif mova m1, [tab_Tm] mova m2, 
[pw_1] mova m6, [pw_512] lea r4, [r1 * 3] lea r5, [r3 * 3] ; register map ; m0 - interpolate coeff ; m1 - shuffle order table ; m2 - constant word 1 dec r0 %rep 4 ; Row 0 vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m3, m1 pmaddubsw m3, m0 pmaddwd m3, m2 ; Row 1 vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m4, m1 pmaddubsw m4, m0 pmaddwd m4, m2 packssdw m3, m4 pmulhrsw m3, m6 ; Row 2 vbroadcasti128 m4, [r0 + r1 * 2] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m4, m1 pmaddubsw m4, m0 pmaddwd m4, m2 ; Row 3 vbroadcasti128 m5, [r0 + r4] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] pshufb m5, m1 pmaddubsw m5, m0 pmaddwd m5, m2 packssdw m4, m5 pmulhrsw m4, m6 packuswb m3, m4 vextracti128 xm4, m3, 1 movd [r2], xm3 pextrw [r2 + 4], xm4, 0 pextrd [r2 + r3], xm3, 1 pextrw [r2 + r3 + 4], xm4, 2 pextrd [r2 + r3 * 2], xm3, 2 pextrw [r2 + r3 * 2 + 4], xm4, 4 pextrd [r2 + r5], xm3, 3 pextrw [r2 + r5 + 4], xm4, 6 lea r2, [r2 + r3 * 4] lea r0, [r0 + r1 * 4] %endrep RET davs2-1.6/source/common/x86/ipfilter8.h000066400000000000000000000047671337322544400177200ustar00rootroot00000000000000/***************************************************************************** * Copyright (C) 2013-2017 MulticoreWare, Inc * * Authors: Steve Borho * Jiaqi Zhang * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at license @ x265.com. 
*****************************************************************************/ #ifndef DAVS2_IPFILTER8_H #define DAVS2_IPFILTER8_H #include "../vec/intrinsic.h" #if defined(__cplusplus) extern "C" { #endif /* __cplusplus */ #define SETUP_FUNC_DEF(cpu) \ FUNCDEF_PU(void, interp_8tap_horiz_pp, cpu, const pel_t* src, intptr_t srcStride, pel_t* dst, intptr_t dstStride, int coeffIdx); \ FUNCDEF_PU(void, interp_8tap_horiz_ps, cpu, const pel_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt); \ FUNCDEF_PU(void, interp_8tap_vert_pp, cpu, const pel_t* src, intptr_t srcStride, pel_t* dst, intptr_t dstStride, int coeffIdx); \ FUNCDEF_PU(void, interp_8tap_vert_ps, cpu, const pel_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); \ FUNCDEF_PU(void, interp_8tap_vert_sp, cpu, const int16_t* src, intptr_t srcStride, pel_t* dst, intptr_t dstStride, int coeffIdx); \ FUNCDEF_PU(void, interp_8tap_vert_ss, cpu, const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); \ FUNCDEF_PU(void, interp_8tap_hv_pp, cpu, const pel_t* src, intptr_t srcStride, pel_t* dst, intptr_t dstStride, int idxX, int idxY) SETUP_FUNC_DEF(sse2); SETUP_FUNC_DEF(ssse3); SETUP_FUNC_DEF(sse3); SETUP_FUNC_DEF(sse4); SETUP_FUNC_DEF(avx2); #if defined(__cplusplus) } #endif /* __cplusplus */ #endif // ifndef DAVS2_IPFILTER8_H davs2-1.6/source/common/x86/mc-a2.asm000066400000000000000000001443751337322544400172420ustar00rootroot00000000000000;***************************************************************************** ;* mc-a2.asm: x86 motion compensation ;***************************************************************************** ;* Copyright (C) 2003-2013 x264 project ;* Copyright (C) 2013-2017 MulticoreWare, Inc ;* Copyright (C) 2018~ VCL, NELVT, Peking University ;* ;* Authors: Loren Merritt ;* Fiona Glaser ;* Holger Lubitz ;* Mathieu Monnier ;* Oskar Arvidsson ;* Min Chen ;* Jiaqi Zhang ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by ;* the Free Software Foundation; either version 2 of the License, or ;* (at your option) any later version. ;* ;* This program is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;* GNU General Public License for more details. ;* ;* You should have received a copy of the GNU General Public License ;* along with this program; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. ;* ;* This program is also available under a commercial proprietary license. ;* For more information, contact us at license @ x265.com. 
;***************************************************************************** %include "x86inc.asm" %include "x86util.asm" SECTION_RODATA 32 deinterleave_shuf: times 2 db 0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15 %if HIGH_BIT_DEPTH deinterleave_shuf32a: SHUFFLE_MASK_W 0,2,4,6,8,10,12,14 deinterleave_shuf32b: SHUFFLE_MASK_W 1,3,5,7,9,11,13,15 %else deinterleave_shuf32a: db 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30 deinterleave_shuf32b: db 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31 %endif cutree_fix8_unpack_shuf: db -1,-1, 0, 1,-1,-1, 2, 3,-1,-1, 4, 5,-1,-1, 6, 7 db -1,-1, 8, 9,-1,-1,10,11,-1,-1,12,13,-1,-1,14,15 const pq_256, times 4 dq 256.0 const pd_inv256, times 4 dq 0.00390625 const pd_0_5, times 4 dq 0.5 SECTION .text cextern pb_0 cextern pw_1 cextern pw_16 cextern pw_32 cextern pw_512 cextern pw_00ff cextern pw_1024 cextern pw_3fff cextern pw_pixel_max cextern pd_ffff cextern pd_16 ;The hpel_filter routines use non-temporal writes for output. ;The following defines may be uncommented for testing. ;Doing the hpel_filter temporal may be a win if the last level cache ;is big enough (preliminary benching suggests on the order of 4* framesize). ;%define movntq movq ;%define movntps movaps ;%define sfence %if HIGH_BIT_DEPTH == 0 %undef movntq %undef movntps %undef sfence %endif ; !HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- ; void plane_copy_core( pixel *dst, intptr_t i_dst, ; pixel *src, intptr_t i_src, int w, int h ) ;----------------------------------------------------------------------------- ; assumes i_dst and w are multiples of 16, and i_dst>w INIT_MMX cglobal plane_copy_core_mmx2, 6,7 FIX_STRIDES r1, r3, r4d %if HIGH_BIT_DEPTH == 0 movsxdifnidn r4, r4d %endif sub r1, r4 sub r3, r4 .loopy: lea r6d, [r4-63] .loopx: prefetchnta [r2+256] movq m0, [r2 ] movq m1, [r2+ 8] movntq [r0 ], m0 movntq [r0+ 8], m1 movq m2, [r2+16] movq m3, [r2+24] movntq [r0+16], m2 movntq [r0+24], m3 movq m4, [r2+32] movq m5, [r2+40] movntq [r0+32], m4 movntq [r0+40], m5 movq m6, [r2+48] movq m7, [r2+56] movntq [r0+48], m6 movntq [r0+56], m7 add r2, 64 add r0, 64 sub r6d, 64 jg .loopx prefetchnta [r2+256] add r6d, 63 jle .end16 .loop16: movq m0, [r2 ] movq m1, [r2+8] movntq [r0 ], m0 movntq [r0+8], m1 add r2, 16 add r0, 16 sub r6d, 16 jg .loop16 .end16: add r0, r1 add r2, r3 dec r5d jg .loopy sfence emms RET %macro INTERLEAVE 4-5 ; dst, srcu, srcv, is_aligned, nt_hint %if HIGH_BIT_DEPTH %assign x 0 %rep 16/mmsize mov%4 m0, [%2+(x/2)*mmsize] mov%4 m1, [%3+(x/2)*mmsize] punpckhwd m2, m0, m1 punpcklwd m0, m1 mov%5a [%1+(x+0)*mmsize], m0 mov%5a [%1+(x+1)*mmsize], m2 %assign x (x+2) %endrep %else movq m0, [%2] %if mmsize==16 %ifidn %4, a punpcklbw m0, [%3] %else movq m1, [%3] punpcklbw m0, m1 %endif mov%5a [%1], m0 %else movq m1, [%3] punpckhbw m2, m0, m1 punpcklbw m0, m1 mov%5a [%1+0], m0 mov%5a [%1+8], m2 %endif %endif ; HIGH_BIT_DEPTH %endmacro %macro DEINTERLEAVE 6 ; dstu, dstv, src, dstv==dstu+8, shuffle constant, is aligned %if HIGH_BIT_DEPTH %assign n 0 %rep 16/mmsize mova m0, [%3+(n+0)*mmsize] mova m1, [%3+(n+1)*mmsize] psrld m2, m0, 16 psrld m3, m1, 16 pand m0, %5 pand m1, %5 packssdw m0, m1 packssdw m2, m3 mov%6 [%1+(n/2)*mmsize], m0 mov%6 [%2+(n/2)*mmsize], m2 %assign n (n+2) %endrep %else ; !HIGH_BIT_DEPTH %if mmsize==16 mova m0, [%3] %if cpuflag(ssse3) pshufb m0, %5 %else mova m1, m0 pand m0, %5 psrlw m1, 8 packuswb m0, m1 %endif %if %4 mova [%1], m0 %else movq [%1], m0 movhps [%2], m0 %endif %else mova m0, [%3] mova m1, [%3+8] mova m2, m0 mova m3, m1 pand 
m0, %5 pand m1, %5 psrlw m2, 8 psrlw m3, 8 packuswb m0, m1 packuswb m2, m3 mova [%1], m0 mova [%2], m2 %endif ; mmsize == 16 %endif ; HIGH_BIT_DEPTH %endmacro %macro PLANE_INTERLEAVE 0 ;----------------------------------------------------------------------------- ; void plane_copy_interleave_core( uint8_t *dst, intptr_t i_dst, ; uint8_t *srcu, intptr_t i_srcu, ; uint8_t *srcv, intptr_t i_srcv, int w, int h ) ;----------------------------------------------------------------------------- ; assumes i_dst and w are multiples of 16, and i_dst>2*w cglobal plane_copy_interleave_core, 6,9 mov r6d, r6m %if HIGH_BIT_DEPTH FIX_STRIDES r1, r3, r5, r6d movifnidn r1mp, r1 movifnidn r3mp, r3 mov r6m, r6d %endif lea r0, [r0+r6*2] add r2, r6 add r4, r6 %if ARCH_X86_64 DECLARE_REG_TMP 7,8 %else DECLARE_REG_TMP 1,3 %endif mov t1, r1 shr t1, SIZEOF_PIXEL sub t1, r6 mov t0d, r7m .loopy: mov r6d, r6m neg r6 .prefetch: prefetchnta [r2+r6] prefetchnta [r4+r6] add r6, 64 jl .prefetch mov r6d, r6m neg r6 .loopx: INTERLEAVE r0+r6*2+ 0*SIZEOF_PIXEL, r2+r6+0*SIZEOF_PIXEL, r4+r6+0*SIZEOF_PIXEL, u, nt INTERLEAVE r0+r6*2+16*SIZEOF_PIXEL, r2+r6+8*SIZEOF_PIXEL, r4+r6+8*SIZEOF_PIXEL, u, nt add r6, 16*SIZEOF_PIXEL jl .loopx .pad: %assign n 0 %rep SIZEOF_PIXEL %if mmsize==8 movntq [r0+r6*2+(n+ 0)], m0 movntq [r0+r6*2+(n+ 8)], m0 movntq [r0+r6*2+(n+16)], m0 movntq [r0+r6*2+(n+24)], m0 %else movntdq [r0+r6*2+(n+ 0)], m0 movntdq [r0+r6*2+(n+16)], m0 %endif %assign n n+32 %endrep add r6, 16*SIZEOF_PIXEL cmp r6, t1 jl .pad add r0, r1mp add r2, r3mp add r4, r5 dec t0d jg .loopy sfence emms RET ;----------------------------------------------------------------------------- ; void store_interleave_chroma( uint8_t *dst, intptr_t i_dst, uint8_t *srcu, uint8_t *srcv, int height ) ;----------------------------------------------------------------------------- cglobal store_interleave_chroma, 5,5 FIX_STRIDES r1 .loop: INTERLEAVE r0+ 0, r2+ 0, r3+ 0, a INTERLEAVE r0+r1, r2+FDEC_STRIDEB, r3+FDEC_STRIDEB, a add r2, FDEC_STRIDEB*2 add r3, FDEC_STRIDEB*2 lea r0, [r0+r1*2] sub r4d, 2 jg .loop RET %endmacro ; PLANE_INTERLEAVE %macro DEINTERLEAVE_START 0 %if HIGH_BIT_DEPTH mova m4, [pd_ffff] %elif cpuflag(ssse3) mova m4, [deinterleave_shuf] %else mova m4, [pw_00ff] %endif ; HIGH_BIT_DEPTH %endmacro %macro PLANE_DEINTERLEAVE 0 ;----------------------------------------------------------------------------- ; void plane_copy_deinterleave( pixel *dstu, intptr_t i_dstu, ; pixel *dstv, intptr_t i_dstv, ; pixel *src, intptr_t i_src, int w, int h ) ;----------------------------------------------------------------------------- cglobal plane_copy_deinterleave, 6,7 DEINTERLEAVE_START mov r6d, r6m FIX_STRIDES r1, r3, r5, r6d %if HIGH_BIT_DEPTH mov r6m, r6d %endif add r0, r6 add r2, r6 lea r4, [r4+r6*2] .loopy: mov r6d, r6m neg r6 .loopx: DEINTERLEAVE r0+r6+0*SIZEOF_PIXEL, r2+r6+0*SIZEOF_PIXEL, r4+r6*2+ 0*SIZEOF_PIXEL, 0, m4, u DEINTERLEAVE r0+r6+8*SIZEOF_PIXEL, r2+r6+8*SIZEOF_PIXEL, r4+r6*2+16*SIZEOF_PIXEL, 0, m4, u add r6, 16*SIZEOF_PIXEL jl .loopx add r0, r1 add r2, r3 add r4, r5 dec dword r7m jg .loopy RET ;----------------------------------------------------------------------------- ; void load_deinterleave_chroma_fenc( pixel *dst, pixel *src, intptr_t i_src, int height ) ;----------------------------------------------------------------------------- cglobal load_deinterleave_chroma_fenc, 4,4 DEINTERLEAVE_START FIX_STRIDES r2 .loop: DEINTERLEAVE r0+ 0, r0+FENC_STRIDEB*1/2, r1+ 0, 1, m4, a DEINTERLEAVE r0+FENC_STRIDEB, r0+FENC_STRIDEB*3/2, r1+r2, 1, m4, a add 
r0, FENC_STRIDEB*2 lea r1, [r1+r2*2] sub r3d, 2 jg .loop RET ;----------------------------------------------------------------------------- ; void load_deinterleave_chroma_fdec( pixel *dst, pixel *src, intptr_t i_src, int height ) ;----------------------------------------------------------------------------- cglobal load_deinterleave_chroma_fdec, 4,4 DEINTERLEAVE_START FIX_STRIDES r2 .loop: DEINTERLEAVE r0+ 0, r0+FDEC_STRIDEB*1/2, r1+ 0, 0, m4, a DEINTERLEAVE r0+FDEC_STRIDEB, r0+FDEC_STRIDEB*3/2, r1+r2, 0, m4, a add r0, FDEC_STRIDEB*2 lea r1, [r1+r2*2] sub r3d, 2 jg .loop RET %endmacro ; PLANE_DEINTERLEAVE %if HIGH_BIT_DEPTH INIT_MMX mmx2 PLANE_INTERLEAVE INIT_MMX mmx PLANE_DEINTERLEAVE INIT_XMM sse2 PLANE_INTERLEAVE PLANE_DEINTERLEAVE INIT_XMM avx PLANE_INTERLEAVE PLANE_DEINTERLEAVE %else INIT_MMX mmx2 PLANE_INTERLEAVE INIT_MMX mmx PLANE_DEINTERLEAVE INIT_XMM sse2 PLANE_INTERLEAVE PLANE_DEINTERLEAVE INIT_XMM ssse3 PLANE_DEINTERLEAVE %endif ; These functions are not general-use; not only do the SSE ones require aligned input, ; but they also will fail if given a non-mod16 size. ; memzero SSE will fail for non-mod128. ;----------------------------------------------------------------------------- ; void *memcpy_aligned( void *dst, const void *src, size_t n ); ;----------------------------------------------------------------------------- %macro MEMCPY 0 cglobal memcpy_aligned, 3,3 %if mmsize == 16 test r2d, 16 jz .copy2 mova m0, [r1+r2-16] mova [r0+r2-16], m0 sub r2d, 16 .copy2: %endif test r2d, 2*mmsize jz .copy4start mova m0, [r1+r2-1*mmsize] mova m1, [r1+r2-2*mmsize] mova [r0+r2-1*mmsize], m0 mova [r0+r2-2*mmsize], m1 sub r2d, 2*mmsize .copy4start: test r2d, r2d jz .ret .copy4: mova m0, [r1+r2-1*mmsize] mova m1, [r1+r2-2*mmsize] mova m2, [r1+r2-3*mmsize] mova m3, [r1+r2-4*mmsize] mova [r0+r2-1*mmsize], m0 mova [r0+r2-2*mmsize], m1 mova [r0+r2-3*mmsize], m2 mova [r0+r2-4*mmsize], m3 sub r2d, 4*mmsize jg .copy4 .ret: REP_RET %endmacro INIT_MMX mmx MEMCPY INIT_XMM sse MEMCPY ; ---------------------------------------------------------------------------- ; void *fast_memcpy( void *dst, const void *src, size_t n ); ; ---------------------------------------------------------------------------- INIT_MMX mmx cglobal fast_memcpy, 3,5,8 ;{ test r2, r2 ; if n = 0, quit jz .L_QUIT ; ; mov r3, r2 ; r3 <-- r2, copy sar r2, 3 ; r2 <-- n/8 and r3, 0x07 ; r3 <-- n%8 prefetchnta [r1] ; prefetch ahead, non-temporal ; ; cal hexnum/8 and remainder/8 and store ; mov r4, r2 ; r4 <-- r2, copy sar r2, 3 ; r2 <-- (n/8)/8 and r4, 0x07 ; r4 <-- (n/8)%8 cmp r2, 0 ; je .HEX_ZERO ; ; align 4 ; .L_COPY_64X: ; prefetchnta [r1 + 128] ; prefetch ahead, non-temporal prefetchnta [r1 + 256] ; prefetch ahead, non-temporal ; ; load 64 bytes data form src ; movq m0, [r1 + 0*8] ; load 8 bytes movq m1, [r1 + 1*8] ; load 8 bytes movq m2, [r1 + 2*8] ; load 8 bytes movq m3, [r1 + 3*8] ; load 8 bytes movq m4, [r1 + 4*8] ; load 8 bytes movq m5, [r1 + 5*8] ; load 8 bytes movq m6, [r1 + 6*8] ; load 8 bytes movq m7, [r1 + 7*8] ; load 8 bytes ; ; store the 64 bytes to dst ; movntq [r0 + 0*8], m0 ; store 8 bytes movntq [r0 + 1*8], m1 ; store 8 bytes movntq [r0 + 2*8], m2 ; store 8 bytes movntq [r0 + 3*8], m3 ; store 8 bytes movntq [r0 + 4*8], m4 ; store 8 bytes movntq [r0 + 5*8], m5 ; store 8 bytes movntq [r0 + 6*8], m6 ; store 8 bytes movntq [r0 + 7*8], m7 ; store 8 bytes ; add r1, 64 ; add r0, 64 ; dec r2 ; jnz .L_COPY_64X ; ; .HEX_ZERO: ; cmp r4, 0 ; je .L_RESIDUAL ; ; .L_COPY_8X: ; movq m3, [r1] ; load 8 bytes movntq [r0], m3 ; store 8 
bytes add r1, 8 ; add r0, 8 ; dec r4 ; jnz .L_COPY_8X ; ; .L_RESIDUAL: ; ; quit ; cmp r3, 0 ; je .L_QUIT ; ; .L_COPY_1X: ; mov r2b, [r1] ; mov [r0], r2b ; add r1, 1 ; add r0, 1 ; dec r3 ; jnz .L_COPY_1X ; ; .L_QUIT: ; sfence ; emms ; RET ; ;} ;----------------------------------------------------------------------------- ; void *memzero_aligned( void *dst, size_t n ); ;----------------------------------------------------------------------------- %macro MEMZERO 1 cglobal memzero_aligned, 2,2 add r0, r1 neg r1 %if mmsize == 8 pxor m0, m0 %else xorps m0, m0 %endif .loop: %assign i 0 %rep %1 mova [r0 + r1 + i], m0 %assign i i+mmsize %endrep add r1, mmsize*%1 jl .loop RET %endmacro INIT_MMX mmx MEMZERO 8 INIT_XMM sse MEMZERO 8 INIT_YMM avx MEMZERO 4 ; ---------------------------------------------------------------------------- ; void *fast_memzero( void *dst, size_t n ); ; ---------------------------------------------------------------------------- INIT_MMX mmx cglobal fast_memzero, 2,3,1 ;{ test r1, r1 ; if n = 0, quit jz .L_QUIT ; mov r2, r1 ; r2 <-- r1 = n, copy sar r1, 3 ; r1 = n/8 and r2, 7 ; r2 = n%8 cmp r1, 0 ; n/8 = 0? je .HEX_ZERO ; jump if n < 8 pxor m0, m0 ; clear m0 ; .L_SET_8X: ; movntq [r0], m0 ; clear 8 bytes add r0, 8 ; r0 = r0 + 8 dec r1 ; r1 = r1 - 1 jnz .L_SET_8X ; loop until r1 = 0 ; .HEX_ZERO: ; xor r1, r1 ; clear r1 cmp r2, 0 ; n%8 = 0? je .L_QUIT ; ; .L_RESIDUAL: ; mov [r0], r1b ; mov 1 byte add r0, 1 ; dec r2 ; jnz .L_RESIDUAL ; ; .L_QUIT: ; emms ; RET ; ;} ; ---------------------------------------------------------------------------- ; void *fast_memset( void *dst, int val, size_t n ); ; ---------------------------------------------------------------------------- INIT_MMX mmx cglobal fast_memset, 3,4,1 ;{ test r2, r2 ; if n = 0, quit jz .L_QUIT ; mov r3, r2 ; r3 <-- r2 = n, copy sar r2, 3 ; r2 = n/8 and r3, 7 ; r3 = n%8 cmp r2, 0 ; n/8 = 0? je .HEX_ZERO ; jump if n < 8 movd m0, r1d ; m0[ 0] = val (DWORD) pshufw m0, m0, 0 ; m0[ 3 2 1 0] = val (WORD) packsswb m0, m0 ; m0[76543210] = val (BYTE) ; .L_SET_8X: ; movntq [r0], m0 ; clear 8 bytes add r0, 8 ; r0 = r0 + 8 dec r2 ; r2 = r2 - 1 jnz .L_SET_8X ; loop until r2 = 0 ; .HEX_ZERO: ; cmp r3, 0 ; n%8 = 0? 
je .L_QUIT ; ; .L_RESIDUAL: ; mov [r0], r1b ; mov 1 byte add r0, 1 ; dec r3 ; jnz .L_RESIDUAL ; ; .L_QUIT: ; emms ; RET ; ;} ; ------------------------------------------------------------------ ; param 1: dst, param 2: src stride ; r0 -- src %macro FILT_8x2 2 mova m3, [r0 + 8] mova m2, [r0 ] pavgb m3, [r0 + %2 + 8] pavgb m2, [r0 + %2 ] mova m1, [r0 + 9] mova m0, [r0 + 1] pavgb m1, [r0 + %2 + 9] pavgb m0, [r0 + %2 + 1] pavgb m1, m3 pavgb m0, m2 pand m1, m7 pand m0, m7 packuswb m0, m1 movu [%1], m0 %endmacro ; ------------------------------------------------------------------ ; param 1: dst, param 2: src stride ; r0 -- src %macro FILT_16x2 2 mova m3, [r0 + mmsize] mova m2, [r0 ] pavgb m3, [r0 + %2 + mmsize] pavgb m2, [r0 + %2 ] PALIGNR m0, m3, 1, m6 pavgb m0, m3 PALIGNR m3, m2, 1, m6 pavgb m3, m2 pand m0, m7 pand m3, m7 packuswb m3, m0 movu [%1], m3 %endmacro ; ---------------------------------------------------------------------------- ; void lowres_filter_core_c( pel_t *src, int i_src, pel_t *dst, int i_dst, ; int width, int height ) ; ---------------------------------------------------------------------------- %macro LOWRES_FILTER_CORE 0 cglobal lowres_filter_core, 6,7,8 %if mmsize >= 16 ; add r4, mmsize-1 ; and r4, ~(mmsize-1) ; %endif ; ; src += 2*[(height-1)*i_src + width] ; mov r6d, r5d ; r6 <-- height dec r6d ; r6 <-- (height - 1) imul r6d, r1d ; r6 <-- (height - 1) * i_src add r6d, r4d ; r6 <-- (height - 1) * i_src + width lea r0, [r0+r6*2] ; r0 <== src + 2*((height - 1) * i_src + width) ; dst += (height-1)*stride + width ; mov r6d, r5d ; r6 <-- height dec r6d ; r6 <-- (height - 1) imul r6d, r3d ; r6 <-- (height - 1) * i_dst add r6d, r4d ; r6 <-- (height - 1) * i_dst + width add r2, r6 ; r2 <== dst + (height - 1) * i_dst + width ; gap of src and dst in each line ; sub r3d, r4d ; r3 <== i_dst - width // dst gap mov r6d, r1d ; r6 <-- i_src sub r6d, r4d ; r6 <-- i_src - width shl r6d, 1 ; r6 <-- 2 * (i_src - width) PUSH r6 ; src gap %define src_gap [rsp] ; ; pcmpeqb m7, m7 ; m7 <-- [FFFF...FFFF] psrlw m7, 8 ; m7 <-- [00FF...00FF] ; .vloop: ; ==== for (; height>0; height--) { mov r6d, r4d ; r6 <-- width %ifnidn cpuname, mmx2 ; %if mmsize <= 16 ; mova m0, [r0 ] ; load from src mova m1, [r0 + r1] ; load from down line pavgb m0, m1 ; m0 <-- average of 2 lines %endif ; %endif ; .hloop: ; -------- for (; width>0; width-=mmsize) { sub r0, mmsize*2 ; src -= mmsize * 2 sub r2, mmsize ; dst -= mmsize %ifidn cpuname, mmx2 ; FILT_8x2 r2, r1 ; %else ; FILT_16x2 r2, r1 ; %endif ; sub r6d, mmsize ; r6 -= mmsize jg .hloop ; -------- } // end for (width...) ; .skip: ; sub r0, src_gap ; sub r2, r3 ; dec r5d ; jg .vloop ; ==== } // end for (height...) 
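    ; release the src_gap slot pushed in the prologue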
ADD rsp, gprsize ; emms ; RET ; %endmacro ; LOWRES_FILTER_CORE INIT_MMX mmx2 LOWRES_FILTER_CORE ; lowres_filter_core_mmx2 INIT_XMM sse2 LOWRES_FILTER_CORE ; lowres_filter_core_sse2 INIT_XMM ssse3 LOWRES_FILTER_CORE ; lowres_filter_core_ssse3 INIT_XMM avx LOWRES_FILTER_CORE ; lowres_filter_core_avx ; %if HIGH_BIT_DEPTH == 0 ; ;----------------------------------------------------------------------------- ; ; void integral_init4h( uint16_t *sum, uint8_t *pix, intptr_t stride ) ; ;----------------------------------------------------------------------------- ; %macro INTEGRAL_INIT4H 0 ; cglobal integral_init4h, 3,4 ; lea r3, [r0+r2*2] ; add r1, r2 ; neg r2 ; pxor m4, m4 ; .loop: ; mova m0, [r1+r2] ; %if mmsize==32 ; movu m1, [r1+r2+8] ; %else ; mova m1, [r1+r2+16] ; palignr m1, m0, 8 ; %endif ; mpsadbw m0, m4, 0 ; mpsadbw m1, m4, 0 ; paddw m0, [r0+r2*2] ; paddw m1, [r0+r2*2+mmsize] ; mova [r3+r2*2 ], m0 ; mova [r3+r2*2+mmsize], m1 ; add r2, mmsize ; jl .loop ; RET ; %endmacro ; ; INIT_XMM sse4 ; INTEGRAL_INIT4H ; INIT_YMM avx2 ; INTEGRAL_INIT4H ; ; %macro INTEGRAL_INIT8H 0 ; cglobal integral_init8h, 3,4 ; lea r3, [r0+r2*2] ; add r1, r2 ; neg r2 ; pxor m4, m4 ; .loop: ; mova m0, [r1+r2] ; %if mmsize==32 ; movu m1, [r1+r2+8] ; mpsadbw m2, m0, m4, 100100b ; mpsadbw m3, m1, m4, 100100b ; %else ; mova m1, [r1+r2+16] ; palignr m1, m0, 8 ; mpsadbw m2, m0, m4, 100b ; mpsadbw m3, m1, m4, 100b ; %endif ; mpsadbw m0, m4, 0 ; mpsadbw m1, m4, 0 ; paddw m0, [r0+r2*2] ; paddw m1, [r0+r2*2+mmsize] ; paddw m0, m2 ; paddw m1, m3 ; mova [r3+r2*2 ], m0 ; mova [r3+r2*2+mmsize], m1 ; add r2, mmsize ; jl .loop ; RET ; %endmacro ; ; INIT_XMM sse4 ; INTEGRAL_INIT8H ; INIT_XMM avx ; INTEGRAL_INIT8H ; INIT_YMM avx2 ; INTEGRAL_INIT8H ; %endif ; !HIGH_BIT_DEPTH ; ; %macro INTEGRAL_INIT_8V 0 ; ;----------------------------------------------------------------------------- ; ; void integral_init8v( uint16_t *sum8, intptr_t stride ) ; ;----------------------------------------------------------------------------- ; cglobal integral_init8v, 3,3 ; add r1, r1 ; add r0, r1 ; lea r2, [r0+r1*8] ; neg r1 ; .loop: ; mova m0, [r2+r1] ; mova m1, [r2+r1+mmsize] ; psubw m0, [r0+r1] ; psubw m1, [r0+r1+mmsize] ; mova [r0+r1], m0 ; mova [r0+r1+mmsize], m1 ; add r1, 2*mmsize ; jl .loop ; RET ; %endmacro ; ; INIT_MMX mmx ; INTEGRAL_INIT_8V ; INIT_XMM sse2 ; INTEGRAL_INIT_8V ; INIT_YMM avx2 ; INTEGRAL_INIT_8V ; ; ;----------------------------------------------------------------------------- ; ; void integral_init4v( uint16_t *sum8, uint16_t *sum4, intptr_t stride ) ; ;----------------------------------------------------------------------------- ; INIT_MMX mmx ; cglobal integral_init4v, 3,5 ; shl r2, 1 ; lea r3, [r0+r2*4] ; lea r4, [r0+r2*8] ; mova m0, [r0+r2] ; mova m4, [r4+r2] ; .loop: ; mova m1, m4 ; psubw m1, m0 ; mova m4, [r4+r2-8] ; mova m0, [r0+r2-8] ; paddw m1, m4 ; mova m3, [r3+r2-8] ; psubw m1, m0 ; psubw m3, m0 ; mova [r0+r2-8], m1 ; mova [r1+r2-8], m3 ; sub r2, 8 ; jge .loop ; RET ; ; INIT_XMM sse2 ; cglobal integral_init4v, 3,5 ; shl r2, 1 ; add r0, r2 ; add r1, r2 ; lea r3, [r0+r2*4] ; lea r4, [r0+r2*8] ; neg r2 ; .loop: ; mova m0, [r0+r2] ; mova m1, [r4+r2] ; mova m2, m0 ; mova m4, m1 ; shufpd m0, [r0+r2+16], 1 ; shufpd m1, [r4+r2+16], 1 ; paddw m0, m2 ; paddw m1, m4 ; mova m3, [r3+r2] ; psubw m1, m0 ; psubw m3, m2 ; mova [r0+r2], m1 ; mova [r1+r2], m3 ; add r2, 16 ; jl .loop ; RET ; ; INIT_XMM ssse3 ; cglobal integral_init4v, 3,5 ; shl r2, 1 ; add r0, r2 ; add r1, r2 ; lea r3, [r0+r2*4] ; lea r4, [r0+r2*8] ; neg r2 ; .loop: ; mova 
m2, [r0+r2] ; mova m0, [r0+r2+16] ; mova m4, [r4+r2] ; mova m1, [r4+r2+16] ; palignr m0, m2, 8 ; palignr m1, m4, 8 ; paddw m0, m2 ; paddw m1, m4 ; mova m3, [r3+r2] ; psubw m1, m0 ; psubw m3, m2 ; mova [r0+r2], m1 ; mova [r1+r2], m3 ; add r2, 16 ; jl .loop ; RET ; ; INIT_YMM avx2 ; cglobal integral_init4v, 3,5 ; add r2, r2 ; add r0, r2 ; add r1, r2 ; lea r3, [r0+r2*4] ; lea r4, [r0+r2*8] ; neg r2 ; .loop: ; mova m2, [r0+r2] ; movu m1, [r4+r2+8] ; paddw m0, m2, [r0+r2+8] ; paddw m1, [r4+r2] ; mova m3, [r3+r2] ; psubw m1, m0 ; psubw m3, m2 ; mova [r0+r2], m1 ; mova [r1+r2], m3 ; add r2, 32 ; jl .loop ; RET ; ; %macro FILT8x4 7 ; mova %3, [r0+%7] ; mova %4, [r0+r5+%7] ; pavgb %3, %4 ; pavgb %4, [r0+r5*2+%7] ; PALIGNR %1, %3, 1, m6 ; PALIGNR %2, %4, 1, m6 ; %if cpuflag(xop) ; pavgb %1, %3 ; pavgb %2, %4 ; %else ; pavgb %1, %3 ; pavgb %2, %4 ; psrlw %5, %1, 8 ; psrlw %6, %2, 8 ; pand %1, m7 ; pand %2, m7 ; %endif ; %endmacro ; ; %macro FILT32x4U 4 ; movu m1, [r0+r5] ; pavgb m0, m1, [r0] ; movu m3, [r0+r5+1] ; pavgb m2, m3, [r0+1] ; pavgb m1, [r0+r5*2] ; pavgb m3, [r0+r5*2+1] ; pavgb m0, m2 ; pavgb m1, m3 ; ; movu m3, [r0+r5+mmsize] ; pavgb m2, m3, [r0+mmsize] ; movu m5, [r0+r5+1+mmsize] ; pavgb m4, m5, [r0+1+mmsize] ; pavgb m3, [r0+r5*2+mmsize] ; pavgb m5, [r0+r5*2+1+mmsize] ; pavgb m2, m4 ; pavgb m3, m5 ; ; pshufb m0, m7 ; pshufb m1, m7 ; pshufb m2, m7 ; pshufb m3, m7 ; punpckhqdq m4, m0, m2 ; punpcklqdq m0, m0, m2 ; punpckhqdq m5, m1, m3 ; punpcklqdq m2, m1, m3 ; vpermq m0, m0, q3120 ; vpermq m1, m4, q3120 ; vpermq m2, m2, q3120 ; vpermq m3, m5, q3120 ; movu [%1], m0 ; movu [%2], m1 ; movu [%3], m2 ; movu [%4], m3 ; %endmacro ; ; %macro FILT16x2 4 ; mova m3, [r0+%4+mmsize] ; mova m2, [r0+%4] ; pavgb m3, [r0+%4+r5+mmsize] ; pavgb m2, [r0+%4+r5] ; PALIGNR %1, m3, 1, m6 ; pavgb %1, m3 ; PALIGNR m3, m2, 1, m6 ; pavgb m3, m2 ; %if cpuflag(xop) ; vpperm m5, m3, %1, m7 ; vpperm m3, m3, %1, m6 ; %else ; psrlw m5, m3, 8 ; psrlw m4, %1, 8 ; pand m3, m7 ; pand %1, m7 ; packuswb m3, %1 ; packuswb m5, m4 ; %endif ; mova [%2], m3 ; mova [%3], m5 ; mova %1, m2 ; %endmacro ; ; %macro FILT8x2U 3 ; mova m3, [r0+%3+8] ; mova m2, [r0+%3] ; pavgb m3, [r0+%3+r5+8] ; pavgb m2, [r0+%3+r5] ; mova m1, [r0+%3+9] ; mova m0, [r0+%3+1] ; pavgb m1, [r0+%3+r5+9] ; pavgb m0, [r0+%3+r5+1] ; pavgb m1, m3 ; pavgb m0, m2 ; psrlw m3, m1, 8 ; psrlw m2, m0, 8 ; pand m1, m7 ; pand m0, m7 ; packuswb m0, m1 ; packuswb m2, m3 ; mova [%1], m0 ; mova [%2], m2 ; %endmacro ; ; %macro FILT8xU 3 ; mova m3, [r0+%3+8] ; mova m2, [r0+%3] ; pavgw m3, [r0+%3+r5+8] ; pavgw m2, [r0+%3+r5] ; movu m1, [r0+%3+10] ; movu m0, [r0+%3+2] ; pavgw m1, [r0+%3+r5+10] ; pavgw m0, [r0+%3+r5+2] ; pavgw m1, m3 ; pavgw m0, m2 ; psrld m3, m1, 16 ; psrld m2, m0, 16 ; pand m1, m7 ; pand m0, m7 ; packssdw m0, m1 ; packssdw m2, m3 ; movu [%1], m0 ; mova [%2], m2 ; %endmacro ; ; %macro FILT8xA 4 ; movu m3, [r0+%4+mmsize] ; movu m2, [r0+%4] ; pavgw m3, [r0+%4+r5+mmsize] ; pavgw m2, [r0+%4+r5] ; PALIGNR %1, m3, 2, m6 ; pavgw %1, m3 ; PALIGNR m3, m2, 2, m6 ; pavgw m3, m2 ; %if cpuflag(xop) ; vpperm m5, m3, %1, m7 ; vpperm m3, m3, %1, m6 ; %else ; psrld m5, m3, 16 ; psrld m4, %1, 16 ; pand m3, m7 ; pand %1, m7 ; packssdw m3, %1 ; packssdw m5, m4 ; %endif ; %if cpuflag(avx2) ; vpermq m3, m3, q3120 ; vpermq m5, m5, q3120 ; %endif ; movu [%2], m3 ; movu [%3], m5 ; movu %1, m2 ; %endmacro ; ; ;----------------------------------------------------------------------------- ; ; void frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, ; 
; intptr_t src_stride, intptr_t dst_stride, int width, int height ) ; ;----------------------------------------------------------------------------- ; %macro FRAME_INIT_LOWRES 0 ; cglobal frame_init_lowres_core, 6,7,(12-4*(BIT_DEPTH/9)) ; 8 for HIGH_BIT_DEPTH, 12 otherwise ; %if HIGH_BIT_DEPTH ; shl dword r6m, 1 ; FIX_STRIDES r5 ; shl dword r7m, 1 ; %endif ; %if mmsize >= 16 ; add dword r7m, mmsize-1 ; and dword r7m, ~(mmsize-1) ; %endif ; ; src += 2*(height-1)*stride + 2*width ; mov r6d, r8m ; dec r6d ; imul r6d, r5d ; add r6d, r7m ; lea r0, [r0+r6*2] ; ; dst += (height-1)*stride + width ; mov r6d, r8m ; dec r6d ; imul r6d, r6m ; add r6d, r7m ; add r1, r6 ; add r2, r6 ; add r3, r6 ; add r4, r6 ; ; gap = stride - width ; mov r6d, r6m ; sub r6d, r7m ; PUSH r6 ; %define dst_gap [rsp+gprsize] ; mov r6d, r5d ; sub r6d, r7m ; shl r6d, 1 ; PUSH r6 ; %define src_gap [rsp] ; %if HIGH_BIT_DEPTH ; %if cpuflag(xop) ; mova m6, [deinterleave_shuf32a] ; mova m7, [deinterleave_shuf32b] ; %else ; pcmpeqw m7, m7 ; psrld m7, 16 ; %endif ; .vloop: ; mov r6d, r7m ; %ifnidn cpuname, mmx2 ; movu m0, [r0] ; movu m1, [r0+r5] ; pavgw m0, m1 ; pavgw m1, [r0+r5*2] ; %endif ; .hloop: ; sub r0, mmsize*2 ; sub r1, mmsize ; sub r2, mmsize ; sub r3, mmsize ; sub r4, mmsize ; %ifidn cpuname, mmx2 ; FILT8xU r1, r2, 0 ; FILT8xU r3, r4, r5 ; %else ; FILT8xA m0, r1, r2, 0 ; FILT8xA m1, r3, r4, r5 ; %endif ; sub r6d, mmsize ; jg .hloop ; %else ; !HIGH_BIT_DEPTH ; %if cpuflag(avx2) ; mova m7, [deinterleave_shuf] ; %elif cpuflag(xop) ; mova m6, [deinterleave_shuf32a] ; mova m7, [deinterleave_shuf32b] ; %else ; pcmpeqb m7, m7 ; psrlw m7, 8 ; %endif ; .vloop: ; mov r6d, r7m ; %ifnidn cpuname, mmx2 ; %if mmsize <= 16 ; mova m0, [r0] ; mova m1, [r0+r5] ; pavgb m0, m1 ; pavgb m1, [r0+r5*2] ; %endif ; %endif ; .hloop: ; sub r0, mmsize*2 ; sub r1, mmsize ; sub r2, mmsize ; sub r3, mmsize ; sub r4, mmsize ; %if mmsize==32 ; FILT32x4U r1, r2, r3, r4 ; %elifdef m8 ; FILT8x4 m0, m1, m2, m3, m10, m11, mmsize ; mova m8, m0 ; mova m9, m1 ; FILT8x4 m2, m3, m0, m1, m4, m5, 0 ; %if cpuflag(xop) ; vpperm m4, m2, m8, m7 ; vpperm m2, m2, m8, m6 ; vpperm m5, m3, m9, m7 ; vpperm m3, m3, m9, m6 ; %else ; packuswb m2, m8 ; packuswb m3, m9 ; packuswb m4, m10 ; packuswb m5, m11 ; %endif ; mova [r1], m2 ; mova [r2], m4 ; mova [r3], m3 ; mova [r4], m5 ; %elifidn cpuname, mmx2 ; FILT8x2U r1, r2, 0 ; FILT8x2U r3, r4, r5 ; %else ; FILT16x2 m0, r1, r2, 0 ; FILT16x2 m1, r3, r4, r5 ; %endif ; sub r6d, mmsize ; jg .hloop ; %endif ; HIGH_BIT_DEPTH ; .skip: ; mov r6, dst_gap ; sub r0, src_gap ; sub r1, r6 ; sub r2, r6 ; sub r3, r6 ; sub r4, r6 ; dec dword r8m ; jg .vloop ; ADD rsp, 2*gprsize ; emms ; RET ; %endmacro ; FRAME_INIT_LOWRES ; ; INIT_MMX mmx2 ; FRAME_INIT_LOWRES ; %if ARCH_X86_64 == 0 ; INIT_MMX cache32, mmx2 ; FRAME_INIT_LOWRES ; %endif ; INIT_XMM sse2 ; FRAME_INIT_LOWRES ; INIT_XMM ssse3 ; FRAME_INIT_LOWRES ; INIT_XMM avx ; FRAME_INIT_LOWRES ; INIT_XMM xop ; FRAME_INIT_LOWRES ; %if ARCH_X86_64 == 1 ; INIT_YMM avx2 ; FRAME_INIT_LOWRES ; %endif ; ; ;----------------------------------------------------------------------------- ; ; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, int32_t *intra_costs, ; ; uint16_t *inter_costs, int32_t *inv_qscales, double *fps_factor, int len ) ; ;----------------------------------------------------------------------------- ; INIT_XMM sse2 ; cglobal mbtree_propagate_cost, 7,7,7 ; dec r6d ; movsd m6, [r5] ; mulpd m6, [pd_inv256] ; xor r5d, r5d ; lea r0, [r0+r5*2] ; pxor m4, m4 ; movlhps m6, m6 ; mova m5, 
[pw_3fff] ; ; .loop: ; movh m2, [r2+r5*4] ; intra ; movh m0, [r4+r5*4] ; invq ; movd m3, [r3+r5*2] ; inter ; pand m3, m5 ; punpcklwd m3, m4 ; ; ; PMINSD ; pcmpgtd m1, m2, m3 ; pand m3, m1 ; pandn m1, m2 ; por m3, m1 ; ; movd m1, [r1+r5*2] ; prop ; punpckldq m2, m2 ; punpckldq m0, m0 ; pmuludq m0, m2 ; pshufd m2, m2, q3120 ; pshufd m0, m0, q3120 ; ; punpcklwd m1, m4 ; cvtdq2pd m0, m0 ; mulpd m0, m6 ; intra*invq*fps_factor>>8 ; cvtdq2pd m1, m1 ; prop ; addpd m0, m1 ; prop + (intra*invq*fps_factor>>8) ; ;cvtdq2ps m1, m2 ; intra ; cvtdq2pd m1, m2 ; intra ; psubd m2, m3 ; intra - inter ; cvtdq2pd m2, m2 ; intra - inter ; ;rcpps m3, m1 ; ;mulps m1, m3 ; intra * (1/intra 1st approx) ; ;mulps m1, m3 ; intra * (1/intra 1st approx)^2 ; ;addps m3, m3 ; 2 * (1/intra 1st approx) ; ;subps m3, m1 ; 2nd approximation for 1/intra ; ;cvtps2pd m3, m3 ; 1 / intra 1st approximation ; mulpd m0, m2 ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter) ; ;mulpd m0, m3 ; / intra ; ; ; TODO: DIVPD very slow, but match to C model output, since it is not bottleneck function, I comment above faster code ; divpd m0, m1 ; addpd m0, [pd_0_5] ; cvttpd2dq m0, m0 ; ; movh [r0+r5*4], m0 ; add r5d, 2 ; cmp r5d, r6d ; jl .loop ; ; xor r6d, r5d ; jnz .even ; movd m2, [r2+r5*4] ; intra ; movd m0, [r4+r5*4] ; invq ; movd m3, [r3+r5*2] ; inter ; pand m3, m5 ; punpcklwd m3, m4 ; ; ; PMINSD ; pcmpgtd m1, m2, m3 ; pand m3, m1 ; pandn m1, m2 ; por m3, m1 ; ; movd m1, [r1+r5*2] ; prop ; punpckldq m2, m2 ; DWORD [_ 1 _ 0] ; punpckldq m0, m0 ; pmuludq m0, m2 ; QWORD [m1 m0] ; pshufd m2, m2, q3120 ; pshufd m0, m0, q3120 ; punpcklwd m1, m4 ; cvtdq2pd m0, m0 ; mulpd m0, m6 ; intra*invq*fps_factor>>8 ; cvtdq2pd m1, m1 ; prop ; addpd m0, m1 ; prop + (intra*invq*fps_factor>>8) ; cvtdq2pd m1, m2 ; intra ; psubd m2, m3 ; intra - inter ; cvtdq2pd m2, m2 ; intra - inter ; mulpd m0, m2 ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter) ; ; divpd m0, m1 ; addpd m0, [pd_0_5] ; cvttpd2dq m0, m0 ; movd [r0+r5*4], m0 ; .even: ; RET ; ; ; ;----------------------------------------------------------------------------- ; ; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, int32_t *intra_costs, ; ; uint16_t *inter_costs, int32_t *inv_qscales, double *fps_factor, int len ) ; ;----------------------------------------------------------------------------- ; ; FIXME: align loads/stores to 16 bytes ; %macro MBTREE_AVX 0 ; cglobal mbtree_propagate_cost, 7,7,7 ; sub r6d, 3 ; vbroadcastsd m6, [r5] ; mulpd m6, [pd_inv256] ; xor r5d, r5d ; mova m5, [pw_3fff] ; ; .loop: ; movu xm2, [r2+r5*4] ; intra ; movu xm0, [r4+r5*4] ; invq ; pmovzxwd xm3, [r3+r5*2] ; inter ; pand xm3, xm5 ; pminsd xm3, xm2 ; ; pmovzxwd xm1, [r1+r5*2] ; prop ; pmulld xm0, xm2 ; cvtdq2pd m0, xm0 ; cvtdq2pd m1, xm1 ; prop ; ;%if cpuflag(avx2) ; ; fmaddpd m0, m0, m6, m1 ; ;%else ; mulpd m0, m6 ; intra*invq*fps_factor>>8 ; addpd m0, m1 ; prop + (intra*invq*fps_factor>>8) ; ;%endif ; cvtdq2pd m1, xm2 ; intra ; psubd xm2, xm3 ; intra - inter ; cvtdq2pd m2, xm2 ; intra - inter ; mulpd m0, m2 ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter) ; ; ; TODO: DIVPD very slow, but match to C model output, since it is not bottleneck function, I comment above faster code ; divpd m0, m1 ; addpd m0, [pd_0_5] ; cvttpd2dq xm0, m0 ; ; movu [r0+r5*4], xm0 ; add r5d, 4 ; process 4 values in one iteration ; cmp r5d, r6d ; jl .loop ; ; add r6d, 3 ; xor r6d, r5d ; jz .even ; if loop counter is multiple of 4, all values are processed ; ; and r6d, 3 ; otherwise, remaining unprocessed values must be 1, 2 
or 3 ; cmp r6d, 1 ; je .process1 ; if only 1 value is unprocessed ; ; ; process 2 values here ; movq xm2, [r2+r5*4] ; intra ; movq xm0, [r4+r5*4] ; invq ; movd xm3, [r3+r5*2] ; inter ; pmovzxwd xm3, xm3 ; pand xm3, xm5 ; pminsd xm3, xm2 ; ; movd xm1, [r1+r5*2] ; prop ; pmovzxwd xm1, xm1 ; pmulld xm0, xm2 ; cvtdq2pd m0, xm0 ; cvtdq2pd m1, xm1 ; prop ; ;%if cpuflag(avx2) ; ; fmaddpd m0, m0, m6, m1 ; ;%else ; mulpd m0, m6 ; intra*invq*fps_factor>>8 ; addpd m0, m1 ; prop + (intra*invq*fps_factor>>8) ; ;%endif ; cvtdq2pd m1, xm2 ; intra ; psubd xm2, xm3 ; intra - inter ; cvtdq2pd m2, xm2 ; intra - inter ; mulpd m0, m2 ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter) ; ; divpd m0, m1 ; addpd m0, [pd_0_5] ; cvttpd2dq xm0, m0 ; movq [r0+r5*4], xm0 ; ; xor r6d, 2 ; jz .even ; add r5d, 2 ; ; ; process 1 value here ; .process1: ; movd xm2, [r2+r5*4] ; intra ; movd xm0, [r4+r5*4] ; invq ; movzx r6d, word [r3+r5*2] ; inter ; movd xm3, r6d ; pand xm3, xm5 ; pminsd xm3, xm2 ; ; movzx r6d, word [r1+r5*2] ; prop ; movd xm1, r6d ; pmulld xm0, xm2 ; cvtdq2pd m0, xm0 ; cvtdq2pd m1, xm1 ; prop ; ;%if cpuflag(avx2) ; ; fmaddpd m0, m0, m6, m1 ; ;%else ; mulpd m0, m6 ; intra*invq*fps_factor>>8 ; addpd m0, m1 ; prop + (intra*invq*fps_factor>>8) ; ;%endif ; cvtdq2pd m1, xm2 ; intra ; psubd xm2, xm3 ; intra - inter ; cvtdq2pd m2, xm2 ; intra - inter ; mulpd m0, m2 ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter) ; ; divpd m0, m1 ; addpd m0, [pd_0_5] ; cvttpd2dq xm0, m0 ; movd [r0+r5*4], xm0 ; .even: ; RET ; %endmacro ; ; INIT_YMM avx ; MBTREE_AVX ; ; INIT_YMM avx2 ; MBTREE_AVX ; ; ; %macro CUTREE_FIX8 0 ; ;----------------------------------------------------------------------------- ; ; void cutree_fix8_pack( uint16_t *dst, double *src, int count ) ; ;----------------------------------------------------------------------------- ; cglobal cutree_fix8_pack, 3, 4, 5 ; movapd m2, [pq_256] ; sub r2d, mmsize / 2 ; movsxdifnidn r2, r2d ; lea r1, [r1 + 8 * r2] ; lea r0, [r0 + 2 * r2] ; neg r2 ; jg .skip_loop ; .loop: ; mulpd m0, m2, [r1 + 8 * r2] ; mulpd m1, m2, [r1 + 8 * r2 + mmsize] ; mulpd m3, m2, [r1 + 8 * r2 + 2 * mmsize] ; mulpd m4, m2, [r1 + 8 * r2 + 3 * mmsize] ; cvttpd2dq xm0, m0 ; cvttpd2dq xm1, m1 ; cvttpd2dq xm3, m3 ; cvttpd2dq xm4, m4 ; %if mmsize == 32 ; vinserti128 m0, m0, xm3, 1 ; vinserti128 m1, m1, xm4, 1 ; packssdw m0, m1 ; %else ; punpcklqdq m0, m1 ; punpcklqdq m3, m4 ; packssdw m0, m3 ; %endif ; mova [r0 + 2 * r2], m0 ; add r2, mmsize / 2 ; jle .loop ; .skip_loop: ; sub r2, mmsize / 2 ; jz .end ; ; Do the remaining values in scalar in order to avoid overreading src. 
; .scalar: ; movq xm0, [r1 + 8 * r2 + 4 * mmsize] ; mulsd xm0, xm2 ; cvttsd2si r3d, xm0 ; mov [r0 + 2 * r2 + mmsize], r3w ; inc r2 ; jl .scalar ; .end: ; RET ; ; ;----------------------------------------------------------------------------- ; ; void cutree_fix8_unpack( double *dst, uint16_t *src, int count ) ; ;----------------------------------------------------------------------------- ; cglobal cutree_fix8_unpack, 3, 4, 7 ; %if mmsize != 32 ; mova m4, [cutree_fix8_unpack_shuf+16] ; %endif ; movapd m2, [pd_inv256] ; mova m3, [cutree_fix8_unpack_shuf] ; sub r2d, mmsize / 2 ; movsxdifnidn r2, r2d ; lea r1, [r1 + 2 * r2] ; lea r0, [r0 + 8 * r2] ; neg r2 ; jg .skip_loop ; .loop: ; %if mmsize == 32 ; vbroadcasti128 m0, [r1 + 2 * r2] ; vbroadcasti128 m1, [r1 + 2 * r2 + 16] ; pshufb m0, m3 ; pshufb m1, m3 ; %else ; mova m1, [r1 + 2 * r2] ; pshufb m0, m1, m3 ; pshufb m1, m4 ; %endif ; psrad m0, 16 ; sign-extend ; psrad m1, 16 ; cvtdq2pd m5, xm0 ; cvtdq2pd m6, xm1 ; %if mmsize == 32 ; vpermq m0, m0, q1032 ; vpermq m1, m1, q1032 ; %else ; psrldq m0, 8 ; psrldq m1, 8 ; %endif ; cvtdq2pd m0, xm0 ; cvtdq2pd m1, xm1 ; mulpd m0, m2 ; mulpd m1, m2 ; mulpd m5, m2 ; mulpd m6, m2 ; movapd [r0 + 8 * r2], m5 ; movapd [r0 + 8 * r2 + mmsize], m0 ; movapd [r0 + 8 * r2 + mmsize * 2], m6 ; movapd [r0 + 8 * r2 + mmsize * 3], m1 ; add r2, mmsize / 2 ; jle .loop ; .skip_loop: ; sub r2, mmsize / 2 ; jz .end ; .scalar: ; movzx r3d, word [r1 + 2 * r2 + mmsize] ; movsx r3d, r3w ; cvtsi2sd xm0, r3d ; mulsd xm0, xm2 ; movsd [r0 + 8 * r2 + 4 * mmsize], xm0 ; inc r2 ; jl .scalar ; .end: ; RET ; %endmacro ; ; INIT_XMM ssse3 ; CUTREE_FIX8 ; ; INIT_YMM avx2 ; CUTREE_FIX8 davs2-1.6/source/common/x86/pixeladd8.asm000066400000000000000000001053421337322544400202140ustar00rootroot00000000000000;***************************************************************************** ;* Copyright (C) 2013-2017 MulticoreWare, Inc ;* Copyright (C) 2018~ VCL, NELVT, Peking University ;* ;* Authors: Praveen Kumar Tiwari ;* Min Chen ;* Jiaqi Zhang ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by ;* the Free Software Foundation; either version 2 of the License, or ;* (at your option) any later version. ;* ;* This program is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;* GNU General Public License for more details. ;* ;* You should have received a copy of the GNU General Public License ;* along with this program; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. ;* ;* This program is also available under a commercial proprietary license. ;* For more information, contact us at license @ x265.com. 
;*****************************************************************************/ %include "x86inc.asm" %include "x86util.asm" SECTION_RODATA 32 SECTION .text cextern pw_pixel_max ;----------------------------------------------------------------------------- ; void pixel_add_ps_4x4(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1) ;----------------------------------------------------------------------------- %if HIGH_BIT_DEPTH INIT_XMM sse2 cglobal pixel_add_ps_4x4, 6, 6, 6, dest, destride, src0, scr1, srcStride0, srcStride1 mova m1, [pw_pixel_max] pxor m0, m0 add r4, r4 add r5, r5 add r1, r1 movh m2, [r2] movhps m2, [r2 + r4] movh m3, [r3] movhps m3, [r3 + r5] lea r2, [r2 + r4 * 2] lea r3, [r3 + r5 * 2] movh m4, [r2] movhps m4, [r2 + r4] movh m5, [r3] movhps m5, [r3 + r5] paddw m2, m3 paddw m4, m5 CLIPW2 m2, m4, m0, m1 movh [r0], m2 movhps [r0 + r1], m2 lea r0, [r0 + r1 * 2] movh [r0], m4 movhps [r0 + r1], m4 RET %else INIT_XMM sse4 cglobal pixel_add_ps_4x4, 6, 6, 8, dest, destride, src0, scr1, srcStride0, srcStride1 add r5, r5 pmovzxbw m0, [r2] pmovzxbw m2, [r2 + r4] movh m1, [r3] movh m3, [r3 + r5] lea r2, [r2 + r4 * 2] lea r3, [r3 + r5 * 2] pmovzxbw m4, [r2] pmovzxbw m6, [r2 + r4] movh m5, [r3] movh m7, [r3 + r5] paddw m0, m1 paddw m2, m3 paddw m4, m5 paddw m6, m7 packuswb m0, m0 packuswb m2, m2 packuswb m4, m4 packuswb m6, m6 movd [r0], m0 movd [r0 + r1], m2 lea r0, [r0 + r1 * 2] movd [r0], m4 movd [r0 + r1], m6 RET %endif ;----------------------------------------------------------------------------- ; void pixel_add_ps_4x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1) ;----------------------------------------------------------------------------- %macro PIXEL_ADD_PS_W4_H4 2 %if HIGH_BIT_DEPTH INIT_XMM sse2 cglobal pixel_add_ps_4x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, srcStride1 mova m1, [pw_pixel_max] pxor m0, m0 mov r6d, %2/4 add r4, r4 add r5, r5 add r1, r1 .loop: movh m2, [r2] movhps m2, [r2 + r4] movh m3, [r3] movhps m3, [r3 + r5] lea r2, [r2 + r4 * 2] lea r3, [r3 + r5 * 2] movh m4, [r2] movhps m4, [r2 + r4] movh m5, [r3] movhps m5, [r3 + r5] dec r6d lea r2, [r2 + r4 * 2] lea r3, [r3 + r5 * 2] paddw m2, m3 paddw m4, m5 CLIPW2 m2, m4, m0, m1 movh [r0], m2 movhps [r0 + r1], m2 lea r0, [r0 + r1 * 2] movh [r0], m4 movhps [r0 + r1], m4 lea r0, [r0 + r1 * 2] jnz .loop RET %else INIT_XMM sse4 cglobal pixel_add_ps_4x%2, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1 mov r6d, %2/4 add r5, r5 .loop: pmovzxbw m0, [r2] pmovzxbw m2, [r2 + r4] movh m1, [r3] movh m3, [r3 + r5] lea r2, [r2 + r4 * 2] lea r3, [r3 + r5 * 2] pmovzxbw m4, [r2] pmovzxbw m6, [r2 + r4] movh m5, [r3] movh m7, [r3 + r5] dec r6d lea r2, [r2 + r4 * 2] lea r3, [r3 + r5 * 2] paddw m0, m1 paddw m2, m3 paddw m4, m5 paddw m6, m7 packuswb m0, m0 packuswb m2, m2 packuswb m4, m4 packuswb m6, m6 movd [r0], m0 movd [r0 + r1], m2 lea r0, [r0 + r1 * 2] movd [r0], m4 movd [r0 + r1], m6 lea r0, [r0 + r1 * 2] jnz .loop RET %endif %endmacro PIXEL_ADD_PS_W4_H4 4, 8 PIXEL_ADD_PS_W4_H4 4, 16 ;----------------------------------------------------------------------------- ; void pixel_add_ps_8x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1) ;----------------------------------------------------------------------------- %macro PIXEL_ADD_PS_W8_H4 2 %if HIGH_BIT_DEPTH INIT_XMM sse2 cglobal pixel_add_ps_8x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, 
srcStride1 mova m5, [pw_pixel_max] pxor m4, m4 mov r6d, %2/4 add r4, r4 add r5, r5 add r1, r1 .loop: movu m0, [r2] movu m2, [r2 + r4] movu m1, [r3] movu m3, [r3 + r5] lea r2, [r2 + r4 * 2] lea r3, [r3 + r5 * 2] paddw m0, m1 paddw m2, m3 CLIPW2 m0, m2, m4, m5 movu [r0], m0 movu [r0 + r1], m2 movu m0, [r2] movu m2, [r2 + r4] movu m1, [r3] movu m3, [r3 + r5] dec r6d lea r0, [r0 + r1 * 2] lea r2, [r2 + r4 * 2] lea r3, [r3 + r5 * 2] paddw m0, m1 paddw m2, m3 CLIPW2 m0, m2, m4, m5 movu [r0], m0 movu [r0 + r1], m2 lea r0, [r0 + r1 * 2] jnz .loop RET %else INIT_XMM sse4 cglobal pixel_add_ps_8x%2, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1 mov r6d, %2/4 add r5, r5 .loop: pmovzxbw m0, [r2] pmovzxbw m2, [r2 + r4] movu m1, [r3] movu m3, [r3 + r5] lea r2, [r2 + r4 * 2] lea r3, [r3 + r5 * 2] pmovzxbw m4, [r2] pmovzxbw m6, [r2 + r4] movu m5, [r3] movu m7, [r3 + r5] dec r6d lea r2, [r2 + r4 * 2] lea r3, [r3 + r5 * 2] paddw m0, m1 paddw m2, m3 paddw m4, m5 paddw m6, m7 packuswb m0, m0 packuswb m2, m2 packuswb m4, m4 packuswb m6, m6 movh [r0], m0 movh [r0 + r1], m2 lea r0, [r0 + r1 * 2] movh [r0], m4 movh [r0 + r1], m6 lea r0, [r0 + r1 * 2] jnz .loop RET %endif %endmacro PIXEL_ADD_PS_W8_H4 8, 4 PIXEL_ADD_PS_W8_H4 8, 8 PIXEL_ADD_PS_W8_H4 8, 16 PIXEL_ADD_PS_W8_H4 8, 32 ;----------------------------------------------------------------------------- ; void pixel_add_ps_16x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1) ;----------------------------------------------------------------------------- %macro PIXEL_ADD_PS_W16_H4 2 %if HIGH_BIT_DEPTH INIT_XMM sse2 cglobal pixel_add_ps_16x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, srcStride1 mova m5, [pw_pixel_max] pxor m4, m4 mov r6d, %2/4 add r4, r4 add r5, r5 add r1, r1 .loop: movu m0, [r2] movu m2, [r2 + 16] movu m1, [r3] movu m3, [r3 + 16] paddw m0, m1 paddw m2, m3 CLIPW2 m0, m2, m4, m5 movu [r0], m0 movu [r0 + 16], m2 movu m0, [r2 + r4] movu m2, [r2 + r4 + 16] movu m1, [r3 + r5] movu m3, [r3 + r5 + 16] lea r2, [r2 + r4 * 2] lea r3, [r3 + r5 * 2] paddw m0, m1 paddw m2, m3 CLIPW2 m0, m2, m4, m5 movu [r0 + r1], m0 movu [r0 + r1 + 16], m2 movu m0, [r2] movu m2, [r2 + 16] movu m1, [r3] movu m3, [r3 + 16] lea r0, [r0 + r1 * 2] paddw m0, m1 paddw m2, m3 CLIPW2 m0, m2, m4, m5 movu [r0], m0 movu [r0 + 16], m2 movu m0, [r2 + r4] movu m2, [r2 + r4 + 16] movu m1, [r3 + r5] movu m3, [r3 + r5 + 16] dec r6d lea r2, [r2 + r4 * 2] lea r3, [r3 + r5 * 2] paddw m0, m1 paddw m2, m3 CLIPW2 m0, m2, m4, m5 movu [r0 + r1], m0 movu [r0 + r1 + 16], m2 lea r0, [r0 + r1 * 2] jnz .loop RET %else INIT_XMM sse4 cglobal pixel_add_ps_16x%2, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1 mov r6d, %2/4 add r5, r5 .loop: pmovzxbw m0, [r2] pmovzxbw m1, [r2 + 8] pmovzxbw m4, [r2 + r4] pmovzxbw m5, [r2 + r4 + 8] movu m2, [r3] movu m3, [r3 + 16] movu m6, [r3 + r5] movu m7, [r3 + r5 + 16] lea r2, [r2 + r4 * 2] lea r3, [r3 + r5 * 2] paddw m0, m2 paddw m1, m3 paddw m4, m6 paddw m5, m7 packuswb m0, m1 packuswb m4, m5 movu [r0], m0 movu [r0 + r1], m4 pmovzxbw m0, [r2] pmovzxbw m1, [r2 + 8] pmovzxbw m4, [r2 + r4] pmovzxbw m5, [r2 + r4 + 8] movu m2, [r3] movu m3, [r3 + 16] movu m6, [r3 + r5] movu m7, [r3 + r5 + 16] dec r6d lea r0, [r0 + r1 * 2] lea r2, [r2 + r4 * 2] lea r3, [r3 + r5 * 2] paddw m0, m2 paddw m1, m3 paddw m4, m6 paddw m5, m7 packuswb m0, m1 packuswb m4, m5 movu [r0], m0 movu [r0 + r1], m4 lea r0, [r0 + r1 * 2] jnz .loop RET %endif %endmacro PIXEL_ADD_PS_W16_H4 16, 4 PIXEL_ADD_PS_W16_H4 16, 8 
PIXEL_ADD_PS_W16_H4 16, 12 PIXEL_ADD_PS_W16_H4 16, 16 PIXEL_ADD_PS_W16_H4 16, 32 PIXEL_ADD_PS_W16_H4 16, 64 ;----------------------------------------------------------------------------- ; void pixel_add_ps_16x16(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1) ;----------------------------------------------------------------------------- %macro PIXEL_ADD_PS_W16_H4_avx2 1 %if HIGH_BIT_DEPTH %if ARCH_X86_64 INIT_YMM avx2 cglobal pixel_add_ps_16x%1, 6, 10, 4, dest, destride, src0, scr1, srcStride0, srcStride1 mova m3, [pw_pixel_max] pxor m2, m2 mov r6d, %1/4 add r4d, r4d add r5d, r5d add r1d, r1d lea r7, [r4 * 3] lea r8, [r5 * 3] lea r9, [r1 * 3] .loop: movu m0, [r2] movu m1, [r3] paddw m0, m1 CLIPW m0, m2, m3 movu [r0], m0 movu m0, [r2 + r4] movu m1, [r3 + r5] paddw m0, m1 CLIPW m0, m2, m3 movu [r0 + r1], m0 movu m0, [r2 + r4 * 2] movu m1, [r3 + r5 * 2] paddw m0, m1 CLIPW m0, m2, m3 movu [r0 + r1 * 2], m0 movu m0, [r2 + r7] movu m1, [r3 + r8] paddw m0, m1 CLIPW m0, m2, m3 movu [r0 + r9], m0 dec r6d lea r0, [r0 + r1 * 4] lea r2, [r2 + r4 * 4] lea r3, [r3 + r5 * 4] jnz .loop RET %endif %else INIT_YMM avx2 cglobal pixel_add_ps_16x%1, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1 mov r6d, %1/4 add r5, r5 .loop: pmovzxbw m0, [r2] ; row 0 of src0 pmovzxbw m1, [r2 + r4] ; row 1 of src0 movu m2, [r3] ; row 0 of src1 movu m3, [r3 + r5] ; row 1 of src1 paddw m0, m2 paddw m1, m3 packuswb m0, m1 lea r2, [r2 + r4 * 2] lea r3, [r3 + r5 * 2] pmovzxbw m2, [r2] ; row 2 of src0 pmovzxbw m3, [r2 + r4] ; row 3 of src0 movu m4, [r3] ; row 2 of src1 movu m5, [r3 + r5] ; row 3 of src1 paddw m2, m4 paddw m3, m5 packuswb m2, m3 lea r2, [r2 + r4 * 2] lea r3, [r3 + r5 * 2] vpermq m0, m0, 11011000b movu [r0], xm0 ; row 0 of dst vextracti128 xm3, m0, 1 movu [r0 + r1], xm3 ; row 1 of dst lea r0, [r0 + r1 * 2] vpermq m2, m2, 11011000b movu [r0], xm2 ; row 2 of dst vextracti128 xm3, m2, 1 movu [r0 + r1], xm3 ; row 3 of dst lea r0, [r0 + r1 * 2] dec r6d jnz .loop RET %endif %endmacro PIXEL_ADD_PS_W16_H4_avx2 4 PIXEL_ADD_PS_W16_H4_avx2 8 PIXEL_ADD_PS_W16_H4_avx2 12 PIXEL_ADD_PS_W16_H4_avx2 16 PIXEL_ADD_PS_W16_H4_avx2 32 PIXEL_ADD_PS_W16_H4_avx2 64 ;----------------------------------------------------------------------------- ; void pixel_add_ps_32x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1) ;----------------------------------------------------------------------------- %macro PIXEL_ADD_PS_W32_H2 2 %if HIGH_BIT_DEPTH INIT_XMM sse2 cglobal pixel_add_ps_32x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, srcStride1 mova m5, [pw_pixel_max] pxor m4, m4 mov r6d, %2/2 add r4, r4 add r5, r5 add r1, r1 .loop: movu m0, [r2] movu m2, [r2 + 16] movu m1, [r3] movu m3, [r3 + 16] paddw m0, m1 paddw m2, m3 CLIPW2 m0, m2, m4, m5 movu [r0], m0 movu [r0 + 16], m2 movu m0, [r2 + 32] movu m2, [r2 + 48] movu m1, [r3 + 32] movu m3, [r3 + 48] paddw m0, m1 paddw m2, m3 CLIPW2 m0, m2, m4, m5 movu [r0 + 32], m0 movu [r0 + 48], m2 movu m0, [r2 + r4] movu m2, [r2 + r4 + 16] movu m1, [r3 + r5] movu m3, [r3 + r5 + 16] paddw m0, m1 paddw m2, m3 CLIPW2 m0, m2, m4, m5 movu [r0 + r1], m0 movu [r0 + r1 + 16], m2 movu m0, [r2 + r4 + 32] movu m2, [r2 + r4 + 48] movu m1, [r3 + r5 + 32] movu m3, [r3 + r5 + 48] dec r6d lea r2, [r2 + r4 * 2] lea r3, [r3 + r5 * 2] paddw m0, m1 paddw m2, m3 CLIPW2 m0, m2, m4, m5 movu [r0 + r1 + 32], m0 movu [r0 + r1 + 48], m2 lea r0, [r0 + r1 * 2] jnz .loop RET %else INIT_XMM sse4 cglobal 
pixel_add_ps_32x%2, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1 mov r6d, %2/2 add r5, r5 .loop: pmovzxbw m0, [r2] pmovzxbw m1, [r2 + 8] pmovzxbw m2, [r2 + 16] pmovzxbw m3, [r2 + 24] movu m4, [r3] movu m5, [r3 + 16] movu m6, [r3 + 32] movu m7, [r3 + 48] paddw m0, m4 paddw m1, m5 paddw m2, m6 paddw m3, m7 packuswb m0, m1 packuswb m2, m3 movu [r0], m0 movu [r0 + 16], m2 pmovzxbw m0, [r2 + r4] pmovzxbw m1, [r2 + r4 + 8] pmovzxbw m2, [r2 + r4 + 16] pmovzxbw m3, [r2 + r4 + 24] movu m4, [r3 + r5] movu m5, [r3 + r5 + 16] movu m6, [r3 + r5 + 32] movu m7, [r3 + r5 + 48] dec r6d lea r2, [r2 + r4 * 2] lea r3, [r3 + r5 * 2] paddw m0, m4 paddw m1, m5 paddw m2, m6 paddw m3, m7 packuswb m0, m1 packuswb m2, m3 movu [r0 + r1], m0 movu [r0 + r1 + 16], m2 lea r0, [r0 + r1 * 2] jnz .loop RET %endif %endmacro PIXEL_ADD_PS_W32_H2 32, 8 PIXEL_ADD_PS_W32_H2 32, 16 PIXEL_ADD_PS_W32_H2 32, 24 PIXEL_ADD_PS_W32_H2 32, 32 PIXEL_ADD_PS_W32_H2 32, 64 ;----------------------------------------------------------------------------- ; void pixel_add_ps_32x32(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1) ;----------------------------------------------------------------------------- %macro PIXEL_ADD_PS_W32_H4_avx2 1 %if HIGH_BIT_DEPTH %if ARCH_X86_64 INIT_YMM avx2 cglobal pixel_add_ps_32x%1, 6, 10, 6, dest, destride, src0, scr1, srcStride0, srcStride1 mova m5, [pw_pixel_max] pxor m4, m4 mov r6d, %1/4 add r4d, r4d add r5d, r5d add r1d, r1d lea r7, [r4 * 3] lea r8, [r5 * 3] lea r9, [r1 * 3] .loop: movu m0, [r2] movu m2, [r2 + 32] movu m1, [r3] movu m3, [r3 + 32] paddw m0, m1 paddw m2, m3 CLIPW2 m0, m2, m4, m5 movu [r0], m0 movu [r0 + 32], m2 movu m0, [r2 + r4] movu m2, [r2 + r4 + 32] movu m1, [r3 + r5] movu m3, [r3 + r5 + 32] paddw m0, m1 paddw m2, m3 CLIPW2 m0, m2, m4, m5 movu [r0 + r1], m0 movu [r0 + r1 + 32], m2 movu m0, [r2 + r4 * 2] movu m2, [r2 + r4 * 2 + 32] movu m1, [r3 + r5 * 2] movu m3, [r3 + r5 * 2 + 32] paddw m0, m1 paddw m2, m3 CLIPW2 m0, m2, m4, m5 movu [r0 + r1 * 2], m0 movu [r0 + r1 * 2 + 32], m2 movu m0, [r2 + r7] movu m2, [r2 + r7 + 32] movu m1, [r3 + r8] movu m3, [r3 + r8 + 32] paddw m0, m1 paddw m2, m3 CLIPW2 m0, m2, m4, m5 movu [r0 + r9], m0 movu [r0 + r9 + 32], m2 dec r6d lea r0, [r0 + r1 * 4] lea r2, [r2 + r4 * 4] lea r3, [r3 + r5 * 4] jnz .loop RET %endif %else %if ARCH_X86_64 INIT_YMM avx2 cglobal pixel_add_ps_32x%1, 6, 10, 8, dest, destride, src0, scr1, srcStride0, srcStride1 mov r6d, %1/4 add r5, r5 lea r7, [r4 * 3] lea r8, [r5 * 3] lea r9, [r1 * 3] .loop: pmovzxbw m0, [r2] ; first half of row 0 of src0 pmovzxbw m1, [r2 + 16] ; second half of row 0 of src0 movu m2, [r3] ; first half of row 0 of src1 movu m3, [r3 + 32] ; second half of row 0 of src1 paddw m0, m2 paddw m1, m3 packuswb m0, m1 vpermq m0, m0, 11011000b movu [r0], m0 ; row 0 of dst pmovzxbw m0, [r2 + r4] ; first half of row 1 of src0 pmovzxbw m1, [r2 + r4 + 16] ; second half of row 1 of src0 movu m2, [r3 + r5] ; first half of row 1 of src1 movu m3, [r3 + r5 + 32] ; second half of row 1 of src1 paddw m0, m2 paddw m1, m3 packuswb m0, m1 vpermq m0, m0, 11011000b movu [r0 + r1], m0 ; row 1 of dst pmovzxbw m0, [r2 + r4 * 2] ; first half of row 2 of src0 pmovzxbw m1, [r2 + r4 * 2 + 16] ; second half of row 2 of src0 movu m2, [r3 + r5 * 2] ; first half of row 2 of src1 movu m3, [r3 + r5 * 2 + 32] ; second half of row 2 of src1 paddw m0, m2 paddw m1, m3 packuswb m0, m1 vpermq m0, m0, 11011000b movu [r0 + r1 * 2], m0 ; row 2 of dst pmovzxbw m0, [r2 + r7] ; first half of row 3 of
src0 pmovzxbw m1, [r2 + r7 + 16] ; second half of row 3 of src0 movu m2, [r3 + r8] ; first half of row 3 of src1 movu m3, [r3 + r8 + 32] ; second half of row 3 of src1 paddw m0, m2 paddw m1, m3 packuswb m0, m1 vpermq m0, m0, 11011000b movu [r0 + r9], m0 ; row 3 of dst lea r2, [r2 + r4 * 4] lea r3, [r3 + r5 * 4] lea r0, [r0 + r1 * 4] dec r6d jnz .loop RET %endif %endif %endmacro PIXEL_ADD_PS_W32_H4_avx2 8 PIXEL_ADD_PS_W32_H4_avx2 16 PIXEL_ADD_PS_W32_H4_avx2 24 PIXEL_ADD_PS_W32_H4_avx2 32 PIXEL_ADD_PS_W32_H4_avx2 64 ;----------------------------------------------------------------------------- ; void pixel_add_ps_64x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1) ;----------------------------------------------------------------------------- %macro PIXEL_ADD_PS_W64_H2 2 %if HIGH_BIT_DEPTH INIT_XMM sse2 cglobal pixel_add_ps_64x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, srcStride1 mova m5, [pw_pixel_max] pxor m4, m4 mov r6d, %2/2 add r4, r4 add r5, r5 add r1, r1 .loop: movu m0, [r2] movu m2, [r2 + 16] movu m1, [r3] movu m3, [r3 + 16] paddw m0, m1 paddw m2, m3 CLIPW2 m0, m2, m4, m5 movu [r0], m0 movu [r0 + 16], m2 movu m0, [r2 + 32] movu m2, [r2 + 48] movu m1, [r3 + 32] movu m3, [r3 + 48] paddw m0, m1 paddw m2, m3 CLIPW2 m0, m2, m4, m5 movu [r0 + 32], m0 movu [r0 + 48], m2 movu m0, [r2 + 64] movu m2, [r2 + 80] movu m1, [r3 + 64] movu m3, [r3 + 80] paddw m0, m1 paddw m2, m3 CLIPW2 m0, m2, m4, m5 movu [r0 + 64], m0 movu [r0 + 80], m2 movu m0, [r2 + 96] movu m2, [r2 + 112] movu m1, [r3 + 96] movu m3, [r3 + 112] paddw m0, m1 paddw m2, m3 CLIPW2 m0, m2, m4, m5 movu [r0 + 96], m0 movu [r0 + 112], m2 movu m0, [r2 + r4] movu m2, [r2 + r4 + 16] movu m1, [r3 + r5] movu m3, [r3 + r5 + 16] paddw m0, m1 paddw m2, m3 CLIPW2 m0, m2, m4, m5 movu [r0 + r1], m0 movu [r0 + r1 + 16], m2 movu m0, [r2 + r4 + 32] movu m2, [r2 + r4 + 48] movu m1, [r3 + r5 + 32] movu m3, [r3 + r5 + 48] paddw m0, m1 paddw m2, m3 CLIPW2 m0, m2, m4, m5 movu [r0 + r1 + 32], m0 movu [r0 + r1 + 48], m2 movu m0, [r2 + r4 + 64] movu m2, [r2 + r4 + 80] movu m1, [r3 + r5 + 64] movu m3, [r3 + r5 + 80] paddw m0, m1 paddw m2, m3 CLIPW2 m0, m2, m4, m5 movu [r0 + r1 + 64], m0 movu [r0 + r1 + 80], m2 movu m0, [r2 + r4 + 96] movu m2, [r2 + r4 + 112] movu m1, [r3 + r5 + 96] movu m3, [r3 + r5 + 112] dec r6d lea r2, [r2 + r4 * 2] lea r3, [r3 + r5 * 2] paddw m0, m1 paddw m2, m3 CLIPW2 m0, m2, m4, m5 movu [r0 + r1 + 96], m0 movu [r0 + r1 + 112], m2 lea r0, [r0 + r1 * 2] jnz .loop RET %else INIT_XMM sse4 cglobal pixel_add_ps_64x%2, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1 mov r6d, %2/2 add r5, r5 .loop: pmovzxbw m0, [r2] pmovzxbw m1, [r2 + 8] pmovzxbw m2, [r2 + 16] pmovzxbw m3, [r2 + 24] movu m4, [r3] movu m5, [r3 + 16] movu m6, [r3 + 32] movu m7, [r3 + 48] paddw m0, m4 paddw m1, m5 paddw m2, m6 paddw m3, m7 packuswb m0, m1 packuswb m2, m3 movu [r0], m0 movu [r0 + 16], m2 pmovzxbw m0, [r2 + 32] pmovzxbw m1, [r2 + 40] pmovzxbw m2, [r2 + 48] pmovzxbw m3, [r2 + 56] movu m4, [r3 + 64] movu m5, [r3 + 80] movu m6, [r3 + 96] movu m7, [r3 + 112] paddw m0, m4 paddw m1, m5 paddw m2, m6 paddw m3, m7 packuswb m0, m1 packuswb m2, m3 movu [r0 + 32], m0 movu [r0 + 48], m2 pmovzxbw m0, [r2 + r4] pmovzxbw m1, [r2 + r4 + 8] pmovzxbw m2, [r2 + r4 + 16] pmovzxbw m3, [r2 + r4 + 24] movu m4, [r3 + r5] movu m5, [r3 + r5 + 16] movu m6, [r3 + r5 + 32] movu m7, [r3 + r5 + 48] paddw m0, m4 paddw m1, m5 paddw m2, m6 paddw m3, m7 packuswb m0, m1 packuswb m2, m3 movu [r0 + r1], m0 movu [r0 + r1 + 16], m2 
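; second 32 pixels of row 1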
pmovzxbw m0, [r2 + r4 + 32] pmovzxbw m1, [r2 + r4 + 40] pmovzxbw m2, [r2 + r4 + 48] pmovzxbw m3, [r2 + r4 + 56] movu m4, [r3 + r5 + 64] movu m5, [r3 + r5 + 80] movu m6, [r3 + r5 + 96] movu m7, [r3 + r5 + 112] dec r6d lea r2, [r2 + r4 * 2] lea r3, [r3 + r5 * 2] paddw m0, m4 paddw m1, m5 paddw m2, m6 paddw m3, m7 packuswb m0, m1 packuswb m2, m3 movu [r0 + r1 + 32], m0 movu [r0 + r1 + 48], m2 lea r0, [r0 + r1 * 2] jnz .loop RET %endif %endmacro PIXEL_ADD_PS_W64_H2 64, 16 PIXEL_ADD_PS_W64_H2 64, 32 PIXEL_ADD_PS_W64_H2 64, 48 PIXEL_ADD_PS_W64_H2 64, 64 ;----------------------------------------------------------------------------- ; void pixel_add_ps_64x64(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1) ;----------------------------------------------------------------------------- %macro PIXEL_ADD_PS_W64H4_avx2 1 %if HIGH_BIT_DEPTH %if ARCH_X86_64 INIT_YMM avx2 cglobal pixel_add_ps_64x%1, 6, 10, 6, dest, destride, src0, scr1, srcStride0, srcStride1 mova m5, [pw_pixel_max] pxor m4, m4 mov r6d, %1/4 add r4d, r4d add r5d, r5d add r1d, r1d lea r7, [r4 * 3] lea r8, [r5 * 3] lea r9, [r1 * 3] .loop: movu m0, [r2] movu m1, [r2 + 32] movu m2, [r3] movu m3, [r3 + 32] paddw m0, m2 paddw m1, m3 CLIPW2 m0, m1, m4, m5 movu [r0], m0 movu [r0 + 32], m1 movu m0, [r2 + 64] movu m1, [r2 + 96] movu m2, [r3 + 64] movu m3, [r3 + 96] paddw m0, m2 paddw m1, m3 CLIPW2 m0, m1, m4, m5 movu [r0 + 64], m0 movu [r0 + 96], m1 movu m0, [r2 + r4] movu m1, [r2 + r4 + 32] movu m2, [r3 + r5] movu m3, [r3 + r5 + 32] paddw m0, m2 paddw m1, m3 CLIPW2 m0, m1, m4, m5 movu [r0 + r1], m0 movu [r0 + r1 + 32], m1 movu m0, [r2 + r4 + 64] movu m1, [r2 + r4 + 96] movu m2, [r3 + r5 + 64] movu m3, [r3 + r5 + 96] paddw m0, m2 paddw m1, m3 CLIPW2 m0, m1, m4, m5 movu [r0 + r1 + 64], m0 movu [r0 + r1 + 96], m1 movu m0, [r2 + r4 * 2] movu m1, [r2 + r4 * 2 + 32] movu m2, [r3 + r5 * 2] movu m3, [r3 + r5 * 2 + 32] paddw m0, m2 paddw m1, m3 CLIPW2 m0, m1, m4, m5 movu [r0 + r1 * 2], m0 movu [r0 + r1 * 2 + 32], m1 movu m0, [r2 + r4 * 2 + 64] movu m1, [r2 + r4 * 2 + 96] movu m2, [r3 + r5 * 2 + 64] movu m3, [r3 + r5 * 2 + 96] paddw m0, m2 paddw m1, m3 CLIPW2 m0, m1, m4, m5 movu [r0 + r1 * 2 + 64], m0 movu [r0 + r1 * 2 + 96], m1 movu m0, [r2 + r7] movu m1, [r2 + r7 + 32] movu m2, [r3 + r8] movu m3, [r3 + r8 + 32] paddw m0, m2 paddw m1, m3 CLIPW2 m0, m1, m4, m5 movu [r0 + r9], m0 movu [r0 + r9 + 32], m1 movu m0, [r2 + r7 + 64] movu m1, [r2 + r7 + 96] movu m2, [r3 + r8 + 64] movu m3, [r3 + r8 + 96] paddw m0, m2 paddw m1, m3 CLIPW2 m0, m1, m4, m5 movu [r0 + r9 + 64], m0 movu [r0 + r9 + 96], m1 dec r6d lea r0, [r0 + r1 * 4] lea r2, [r2 + r4 * 4] lea r3, [r3 + r5 * 4] jnz .loop RET %endif %else INIT_YMM avx2 cglobal pixel_add_ps_64x%1, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1 mov r6d, %1/2 add r5, r5 .loop: pmovzxbw m0, [r2] ; first 16 of row 0 of src0 pmovzxbw m1, [r2 + 16] ; second 16 of row 0 of src0 pmovzxbw m2, [r2 + 32] ; third 16 of row 0 of src0 pmovzxbw m3, [r2 + 48] ; fourth 16 of row 0 of src0 movu m4, [r3] ; first 16 of row 0 of src1 movu m5, [r3 + 32] ; second 16 of row 0 of src1 movu m6, [r3 + 64] ; third 16 of row 0 of src1 movu m7, [r3 + 96] ; fourth 16 of row 0 of src1 paddw m0, m4 paddw m1, m5 paddw m2, m6 paddw m3, m7 packuswb m0, m1 packuswb m2, m3 vpermq m0, m0, 11011000b movu [r0], m0 ; first 32 of row 0 of dst vpermq m2, m2, 11011000b movu [r0 + 32], m2 ; second 32 of row 0 of dst pmovzxbw m0, [r2 + r4] ; first 16 of row 1 of src0 pmovzxbw m1, [r2 + r4 + 16] ; second 16 of row 1 of src0
pmovzxbw m2, [r2 + r4 + 32] ; third 16 of row 1 of src0 pmovzxbw m3, [r2 + r4 + 48] ; fourth 16 of row 1 of src0 movu m4, [r3 + r5] ; first 16 of row 1 of src1 movu m5, [r3 + r5 + 32] ; second 16 of row 1 of src1 movu m6, [r3 + r5 + 64] ; third 16 of row 1 of src1 movu m7, [r3 + r5 + 96] ; fourth 16 of row 1 of src1 paddw m0, m4 paddw m1, m5 paddw m2, m6 paddw m3, m7 packuswb m0, m1 packuswb m2, m3 vpermq m0, m0, 11011000b movu [r0 + r1], m0 ; first 32 of row 1 of dst vpermq m2, m2, 11011000b movu [r0 + r1 + 32], m2 ; second 32 of row 1 of dst lea r2, [r2 + r4 * 2] lea r3, [r3 + r5 * 2] lea r0, [r0 + r1 * 2] dec r6d jnz .loop RET %endif %endmacro PIXEL_ADD_PS_W64H4_avx2 16 PIXEL_ADD_PS_W64H4_avx2 32 PIXEL_ADD_PS_W64H4_avx2 48 PIXEL_ADD_PS_W64H4_avx2 64 davs2-1.6/source/common/x86/quant8.asm000066400000000000000000000075571337322544400175610ustar00rootroot00000000000000;***************************************************************************** ;* quant8.asm: x86 quantization functions ;***************************************************************************** ;* xavs2 - video encoder of AVS2/IEEE1857.4 video coding standard ;* Copyright (C) 2018~ VCL, NELVT, Peking University ;* ;* Authors: Falei LUO ;* Jiaqi Zhang ;* ;* Homepage1: http://vcl.idm.pku.edu.cn/xavs2 ;* Homepage2: https://github.com/pkuvcl/xavs2 ;* Homepage3: https://gitee.com/pkuvcl/xavs2 ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by ;* the Free Software Foundation; either version 2 of the License, or ;* (at your option) any later version. ;* ;* This program is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;* GNU General Public License for more details. ;* ;* You should have received a copy of the GNU General Public License ;* along with this program; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. ;* ;* This program is also available under a commercial proprietary license. ;* For more information, contact us at sswang @ pku.edu.cn.
;***************************************************************************** %include "x86inc.asm" %include "x86util.asm" SECTION .text ; ---------------------------------------------------------------------------- ; void dequant(coeff_t *coef, const int i_coef, const int scale, const int shift); ; ---------------------------------------------------------------------------- ; ---------------------------------------------------------------------------- ; dequant_sse4 INIT_XMM sse4 cglobal dequant, 2,2,7 ;{ mov r3, r3mp ; r3 <-- shift movd m4, r2mp ; m4[0] = scale movd m6, r3 ; m6[0] = shift dec r3 ; r3d <-- shift - 1 xor r2, r2 ; r2 <-- 0 shr r1, 4 ; r1 = i_coef/16 bts r2, r3 ; r2 <-- add = 1 << (shift - 1) movd m5, r2 ; m5[0] = add pshufd m4, m4, 0 ; m4[3210] = scale pshufd m5, m5, 0 ; m5[3210] = add ; .loop: ; pmovsxwd m0, [r0 ] ; load 4 coeff pmovsxwd m1, [r0 + 8] ; pmovsxwd m2, [r0 + 16] ; pmovsxwd m3, [r0 + 24] ; ; pmulld m0, m4 ; coef[i] * scale pmulld m1, m4 ; pmulld m2, m4 ; pmulld m3, m4 ; paddd m0, m5 ; coef[i] * scale + add paddd m1, m5 ; paddd m2, m5 ; paddd m3, m5 ; psrad m0, m6 ; (coef[i] * scale + add) >> shift psrad m1, m6 ; psrad m2, m6 ; psrad m3, m6 ; ; packssdw m0, m1 ; pack to 8 coeff packssdw m2, m3 ; ; mova [r0 ], m0 ; store mova [r0+16], m2 ; add r0, 32 ; dec r1 ; jnz .loop ; ; RET ; return ;} davs2-1.6/source/common/x86/x86inc.asm000066400000000000000000001257331337322544400174530ustar00rootroot00000000000000;***************************************************************************** ;* x86inc.asm: x264asm abstraction layer ;***************************************************************************** ;* Copyright (C) 2005-2014 x264 project ;* 2013-2014 x265 project ;* ;* Authors: Loren Merritt ;* Anton Mitrofanov ;* Fiona Glaser ;* Henrik Gramner ;* Min Chen ;* ;* Permission to use, copy, modify, and/or distribute this software for any ;* purpose with or without fee is hereby granted, provided that the above ;* copyright notice and this permission notice appear in all copies. ;* ;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES ;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF ;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES ;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF ;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ;***************************************************************************** ; This is a header file for the x264ASM assembly language, which uses ; NASM/YASM syntax combined with a large number of macros to provide easy ; abstraction between different calling conventions (x86_32, win64, linux64). ; It also has various other useful features to simplify writing the kind of ; DSP functions that are most often used in x264. ; Unlike the rest of x264, this file is available under an ISC license, as it ; has significant usefulness outside of x264 and we want it to be available ; to the largest audience possible. Of course, if you modify it for your own ; purposes to add a new feature, we strongly encourage contributing a patch ; as this feature might be useful for others as well. Send patches or ideas ; to x264-devel@videolan.org .
%ifndef private_prefix %define private_prefix davs2 %endif %ifndef public_prefix %define public_prefix private_prefix %endif %ifndef STACK_ALIGNMENT %if ARCH_X86_64 %define STACK_ALIGNMENT 16 %else %define STACK_ALIGNMENT 4 %endif %endif %define WIN64 0 %define UNIX64 0 %if ARCH_X86_64 %ifidn __OUTPUT_FORMAT__,win32 %define WIN64 1 %elifidn __OUTPUT_FORMAT__,win64 %define WIN64 1 %elifidn __OUTPUT_FORMAT__,x64 %define WIN64 1 %else %define UNIX64 1 %endif %endif %ifdef PREFIX %define mangle(x) _ %+ x %else %define mangle(x) x %endif %macro SECTION_RODATA 0-1 32 SECTION .rodata align=%1 %endmacro %macro SECTION_TEXT 0-1 16 SECTION .text align=%1 %endmacro %if WIN64 %define PIC %elif ARCH_X86_64 == 0 ; x86_32 doesn't require PIC. ; Some distros prefer shared objects to be PIC, but nothing breaks if ; the code contains a few textrels, so we'll skip that complexity. %undef PIC %endif %ifdef PIC default rel %endif ; Macros to eliminate most code duplication between x86_32 and x86_64: ; Currently this works only for leaf functions which load all their arguments ; into registers at the start, and make no other use of the stack. Luckily that ; covers most of x264's asm. ; PROLOGUE: ; %1 = number of arguments. loads them from stack if needed. ; %2 = number of registers used. pushes callee-saved regs if needed. ; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed. ; %4 = (optional) stack size to be allocated. The stack will be aligned before ; allocating the specified stack size. If the required stack alignment is ; larger than the known stack alignment the stack will be manually aligned ; and an extra register will be allocated to hold the original stack ; pointer (to not invalidate r0m etc.). To prevent the use of an extra ; register as stack pointer, request a negative stack size. ; %4+/%5+ = list of names to define to registers ; PROLOGUE can also be invoked by adding the same options to cglobal ; e.g. ; cglobal foo, 2,3,7,0x40, dst, src, tmp ; declares a function (foo) that automatically loads two arguments (dst and ; src) into registers, uses one additional register (tmp) plus 7 vector ; registers (m0-m6) and allocates 0x40 bytes of stack space. ; TODO Some functions can use some args directly from the stack. If they're the ; last args then you can just not declare them, but if they're in the middle ; we need more flexible macro. ; RET: ; Pops anything that was pushed by PROLOGUE, and returns. ; REP_RET: ; Use this instead of RET if it's a branch target. 
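; As a minimal worked sketch of the above (a hypothetical function shown
; purely for illustration; it is not part of this file or of davs2):
;   INIT_XMM sse2
;   cglobal copy16, 2,2,1, dst, src ; 2 args, 2 gprs, 1 xmm reg, no stack
;       movu m0, [srcq]             ; 'srcq'/'dstq' name r1/r0 on every ABI
;       movu [dstq], m0
;       RET                         ; undoes whatever PROLOGUE pushed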
; registers: ; rN and rNq are the native-size register holding function argument N ; rNd, rNw, rNb are dword, word, and byte size ; rNh is the high 8 bits of the word size ; rNm is the original location of arg N (a register or on the stack), dword ; rNmp is native size %macro DECLARE_REG 2-3 %define r%1q %2 %define r%1d %2d %define r%1w %2w %define r%1b %2b %define r%1h %2h %if %0 == 2 %define r%1m %2d %define r%1mp %2 %elif ARCH_X86_64 ; memory %define r%1m [rstk + stack_offset + %3] %define r%1mp qword r %+ %1 %+ m %else %define r%1m [rstk + stack_offset + %3] %define r%1mp dword r %+ %1 %+ m %endif %define r%1 %2 %endmacro %macro DECLARE_REG_SIZE 3 %define r%1q r%1 %define e%1q r%1 %define r%1d e%1 %define e%1d e%1 %define r%1w %1 %define e%1w %1 %define r%1h %3 %define e%1h %3 %define r%1b %2 %define e%1b %2 %if ARCH_X86_64 == 0 %define r%1 e%1 %endif %endmacro DECLARE_REG_SIZE ax, al, ah DECLARE_REG_SIZE bx, bl, bh DECLARE_REG_SIZE cx, cl, ch DECLARE_REG_SIZE dx, dl, dh DECLARE_REG_SIZE si, sil, null DECLARE_REG_SIZE di, dil, null DECLARE_REG_SIZE bp, bpl, null ; t# defines for when per-arch register allocation is more complex than just function arguments %macro DECLARE_REG_TMP 1-* %assign %%i 0 %rep %0 CAT_XDEFINE t, %%i, r%1 %assign %%i %%i+1 %rotate 1 %endrep %endmacro %macro DECLARE_REG_TMP_SIZE 0-* %rep %0 %define t%1q t%1 %+ q %define t%1d t%1 %+ d %define t%1w t%1 %+ w %define t%1h t%1 %+ h %define t%1b t%1 %+ b %rotate 1 %endrep %endmacro DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 %if ARCH_X86_64 %define gprsize 8 %else %define gprsize 4 %endif %macro PUSH 1 push %1 %ifidn rstk, rsp %assign stack_offset stack_offset+gprsize %endif %endmacro %macro POP 1 pop %1 %ifidn rstk, rsp %assign stack_offset stack_offset-gprsize %endif %endmacro %macro PUSH_IF_USED 1-* %rep %0 %if %1 < regs_used PUSH r%1 %endif %rotate 1 %endrep %endmacro %macro POP_IF_USED 1-* %rep %0 %if %1 < regs_used pop r%1 %endif %rotate 1 %endrep %endmacro %macro LOAD_IF_USED 1-* %rep %0 %if %1 < num_args mov r%1, r %+ %1 %+ mp %endif %rotate 1 %endrep %endmacro %macro SUB 2 sub %1, %2 %ifidn %1, rstk %assign stack_offset stack_offset+(%2) %endif %endmacro %macro ADD 2 add %1, %2 %ifidn %1, rstk %assign stack_offset stack_offset-(%2) %endif %endmacro %macro movifnidn 2 %ifnidn %1, %2 mov %1, %2 %endif %endmacro %macro movsxdifnidn 2 %ifnidn %1, %2 movsxd %1, %2 %endif %endmacro %macro ASSERT 1 %if (%1) == 0 %error assert failed %endif %endmacro %macro DEFINE_ARGS 0-* %ifdef n_arg_names %assign %%i 0 %rep n_arg_names CAT_UNDEF arg_name %+ %%i, q CAT_UNDEF arg_name %+ %%i, d CAT_UNDEF arg_name %+ %%i, w CAT_UNDEF arg_name %+ %%i, h CAT_UNDEF arg_name %+ %%i, b CAT_UNDEF arg_name %+ %%i, m CAT_UNDEF arg_name %+ %%i, mp CAT_UNDEF arg_name, %%i %assign %%i %%i+1 %endrep %endif %xdefine %%stack_offset stack_offset %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine %assign %%i 0 %rep %0 %xdefine %1q r %+ %%i %+ q %xdefine %1d r %+ %%i %+ d %xdefine %1w r %+ %%i %+ w %xdefine %1h r %+ %%i %+ h %xdefine %1b r %+ %%i %+ b %xdefine %1m r %+ %%i %+ m %xdefine %1mp r %+ %%i %+ mp CAT_XDEFINE arg_name, %%i, %1 %assign %%i %%i+1 %rotate 1 %endrep %xdefine stack_offset %%stack_offset %assign n_arg_names %0 %endmacro %define required_stack_alignment ((mmsize + 15) & ~15) %macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only) %ifnum %1 %if %1 != 0 %assign %%pad 0 %assign stack_size %1 %if stack_size < 0 %assign stack_size -stack_size %endif %if WIN64 %assign 
%%pad %%pad + 32 ; shadow space %if mmsize != 8 %assign xmm_regs_used %2 %if xmm_regs_used > 8 %assign %%pad %%pad + (xmm_regs_used-8)*16 ; callee-saved xmm registers %endif %endif %endif %if required_stack_alignment <= STACK_ALIGNMENT ; maintain the current stack alignment %assign stack_size_padded stack_size + %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1)) SUB rsp, stack_size_padded %else %assign %%reg_num (regs_used - 1) %xdefine rstk r %+ %%reg_num ; align stack, and save original stack location directly above ; it, i.e. in [rsp+stack_size_padded], so we can restore the ; stack in a single instruction (i.e. mov rsp, rstk or mov ; rsp, [rsp+stack_size_padded]) %if %1 < 0 ; need to store rsp on stack %xdefine rstkm [rsp + stack_size + %%pad] %assign %%pad %%pad + gprsize %else ; can keep rsp in rstk during whole function %xdefine rstkm rstk %endif %assign stack_size_padded stack_size + ((%%pad + required_stack_alignment-1) & ~(required_stack_alignment-1)) mov rstk, rsp and rsp, ~(required_stack_alignment-1) sub rsp, stack_size_padded movifnidn rstkm, rstk %endif WIN64_PUSH_XMM %endif %endif %endmacro %macro SETUP_STACK_POINTER 1 %ifnum %1 %if %1 != 0 && required_stack_alignment > STACK_ALIGNMENT %if %1 > 0 %assign regs_used (regs_used + 1) %elif ARCH_X86_64 && regs_used == num_args && num_args <= 4 + UNIX64 * 2 %warning "Stack pointer will overwrite register argument" %endif %endif %endif %endmacro %macro DEFINE_ARGS_INTERNAL 3+ %ifnum %2 DEFINE_ARGS %3 %elif %1 == 4 DEFINE_ARGS %2 %elif %1 > 4 DEFINE_ARGS %2, %3 %endif %endmacro %if WIN64 ; Windows x64 ;================================================= DECLARE_REG 0, rcx DECLARE_REG 1, rdx DECLARE_REG 2, R8 DECLARE_REG 3, R9 DECLARE_REG 4, R10, 40 DECLARE_REG 5, R11, 48 DECLARE_REG 6, rax, 56 DECLARE_REG 7, rdi, 64 DECLARE_REG 8, rsi, 72 DECLARE_REG 9, rbx, 80 DECLARE_REG 10, rbp, 88 DECLARE_REG 11, R12, 96 DECLARE_REG 12, R13, 104 DECLARE_REG 13, R14, 112 DECLARE_REG 14, R15, 120 %macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names... %assign num_args %1 %assign regs_used %2 ASSERT regs_used >= num_args SETUP_STACK_POINTER %4 ASSERT regs_used <= 15 PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14 ALLOC_STACK %4, %3 %if mmsize != 8 && stack_size == 0 WIN64_SPILL_XMM %3 %endif LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 DEFINE_ARGS_INTERNAL %0, %4, %5 %endmacro %macro WIN64_PUSH_XMM 0 ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated. %if xmm_regs_used > 6 movaps [rstk + stack_offset + 8], xmm6 %endif %if xmm_regs_used > 7 movaps [rstk + stack_offset + 24], xmm7 %endif %if xmm_regs_used > 8 %assign %%i 8 %rep xmm_regs_used-8 movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i %assign %%i %%i+1 %endrep %endif %endmacro %macro WIN64_SPILL_XMM 1 %assign xmm_regs_used %1 ASSERT xmm_regs_used <= 16 %if xmm_regs_used > 8 ; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack. 
%assign %%pad (xmm_regs_used-8)*16 + 32 %assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1)) SUB rsp, stack_size_padded %endif WIN64_PUSH_XMM %endmacro %macro WIN64_RESTORE_XMM_INTERNAL 1 %assign %%pad_size 0 %if xmm_regs_used > 8 %assign %%i xmm_regs_used %rep xmm_regs_used-8 %assign %%i %%i-1 movaps xmm %+ %%i, [%1 + (%%i-8)*16 + stack_size + 32] %endrep %endif %if stack_size_padded > 0 %if stack_size > 0 && required_stack_alignment > STACK_ALIGNMENT mov rsp, rstkm %else add %1, stack_size_padded %assign %%pad_size stack_size_padded %endif %endif %if xmm_regs_used > 7 movaps xmm7, [%1 + stack_offset - %%pad_size + 24] %endif %if xmm_regs_used > 6 movaps xmm6, [%1 + stack_offset - %%pad_size + 8] %endif %endmacro %macro WIN64_RESTORE_XMM 1 WIN64_RESTORE_XMM_INTERNAL %1 %assign stack_offset (stack_offset-stack_size_padded) %assign xmm_regs_used 0 %endmacro %define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size > 0 %macro RET 0 WIN64_RESTORE_XMM_INTERNAL rsp POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7 %if mmsize == 32 vzeroupper %endif AUTO_REP_RET %endmacro %elif ARCH_X86_64 ; *nix x64 ;============================================= DECLARE_REG 0, rdi DECLARE_REG 1, rsi DECLARE_REG 2, rdx DECLARE_REG 3, rcx DECLARE_REG 4, R8 DECLARE_REG 5, R9 DECLARE_REG 6, rax, 8 DECLARE_REG 7, R10, 16 DECLARE_REG 8, R11, 24 DECLARE_REG 9, rbx, 32 DECLARE_REG 10, rbp, 40 DECLARE_REG 11, R12, 48 DECLARE_REG 12, R13, 56 DECLARE_REG 13, R14, 64 DECLARE_REG 14, R15, 72 %macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names... %assign num_args %1 %assign regs_used %2 ASSERT regs_used >= num_args SETUP_STACK_POINTER %4 ASSERT regs_used <= 15 PUSH_IF_USED 9, 10, 11, 12, 13, 14 ALLOC_STACK %4 LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14 DEFINE_ARGS_INTERNAL %0, %4, %5 %endmacro %define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0 %macro RET 0 %if stack_size_padded > 0 %if required_stack_alignment > STACK_ALIGNMENT mov rsp, rstkm %else add rsp, stack_size_padded %endif %endif POP_IF_USED 14, 13, 12, 11, 10, 9 %if mmsize == 32 vzeroupper %endif AUTO_REP_RET %endmacro %else ; X86_32 ;============================================================== DECLARE_REG 0, eax, 4 DECLARE_REG 1, ecx, 8 DECLARE_REG 2, edx, 12 DECLARE_REG 3, ebx, 16 DECLARE_REG 4, esi, 20 DECLARE_REG 5, edi, 24 DECLARE_REG 6, ebp, 28 %define rsp esp %macro DECLARE_ARG 1-* %rep %0 %define r%1m [rstk + stack_offset + 4*%1 + 4] %define r%1mp dword r%1m %rotate 1 %endrep %endmacro DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 %macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names... 
%assign num_args %1 %assign regs_used %2 ASSERT regs_used >= num_args %if num_args > 7 %assign num_args 7 %endif %if regs_used > 7 %assign regs_used 7 %endif SETUP_STACK_POINTER %4 ASSERT regs_used <= 7 PUSH_IF_USED 3, 4, 5, 6 ALLOC_STACK %4 LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6 DEFINE_ARGS_INTERNAL %0, %4, %5 %endmacro %define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0 %macro RET 0 %if stack_size_padded > 0 %if required_stack_alignment > STACK_ALIGNMENT mov rsp, rstkm %else add rsp, stack_size_padded %endif %endif POP_IF_USED 6, 5, 4, 3 %if mmsize == 32 vzeroupper %endif AUTO_REP_RET %endmacro %endif ;====================================================================== %if WIN64 == 0 %macro WIN64_SPILL_XMM 1 %endmacro %macro WIN64_RESTORE_XMM 1 %endmacro %macro WIN64_PUSH_XMM 0 %endmacro %endif ; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either ; a branch or a branch target. So switch to a 2-byte form of ret in that case. ; We can automatically detect "follows a branch", but not a branch target. ; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.) %macro REP_RET 0 %if has_epilogue RET %else rep ret %endif %endmacro %define last_branch_adr $$ %macro AUTO_REP_RET 0 %ifndef cpuflags times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ != last_branch_adr. %elif notcpuflag(ssse3) times ((last_branch_adr-$)>>31)+1 rep %endif ret %endmacro %macro BRANCH_INSTR 0-* %rep %0 %macro %1 1-2 %1 %2 %1 %%branch_instr: %xdefine last_branch_adr %%branch_instr %endmacro %rotate 1 %endrep %endmacro BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp %macro TAIL_CALL 2 ; callee, is_nonadjacent %if has_epilogue call %1 RET %elif %2 jmp %1 %endif %endmacro ;============================================================================= ; arch-independent part ;============================================================================= %assign function_align 16 ; Begin a function. ; Applies any symbol mangling needed for C linkage, and sets up a define such that ; subsequent uses of the function name automatically refer to the mangled version. ; Appends cpuflags to the function name if cpuflags has been specified. ; The "" empty default parameter is a workaround for nasm, which fails if SUFFIX ; is empty and we call cglobal_internal with just %1 %+ SUFFIX (without %2). 
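; For example: private_prefix is defined as davs2 at the top of this file,
; so the declaration 'cglobal pixel_add_ps_4x4, 6, 6, 8, ...' in
; pixeladd8.asm, assembled after 'INIT_XMM sse4', produces the global symbol
; davs2_pixel_add_ps_4x4_sse4 (plus an extra leading underscore when PREFIX
; is defined), and later uses of the plain name refer to that mangled symbol.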
%macro cglobal 1-2+ "" ; name, [PROLOGUE args] cglobal_internal 1, %1 %+ SUFFIX, %2 %endmacro %macro cvisible 1-2+ "" ; name, [PROLOGUE args] cglobal_internal 0, %1 %+ SUFFIX, %2 %endmacro %macro cglobal_internal 2-3+ %if %1 %xdefine %%FUNCTION_PREFIX private_prefix %xdefine %%VISIBILITY hidden %else %xdefine %%FUNCTION_PREFIX public_prefix %xdefine %%VISIBILITY %endif %ifndef cglobaled_%2 %xdefine %2 mangle(%%FUNCTION_PREFIX %+ _ %+ %2) %xdefine %2.skip_prologue %2 %+ .skip_prologue CAT_XDEFINE cglobaled_, %2, 1 %endif %xdefine current_function %2 %ifidn __OUTPUT_FORMAT__,elf global %2:function %%VISIBILITY %else global %2 %endif align function_align %2: RESET_MM_PERMUTATION ; needed for x86-64, also makes disassembly somewhat nicer %xdefine rstk rsp ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required %assign stack_offset 0 ; stack pointer offset relative to the return address %assign stack_size 0 ; amount of stack space that can be freely used inside a function %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding %assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64 %ifnidn %3, "" PROLOGUE %3 %endif %endmacro %macro cextern 1 %xdefine %1 mangle(private_prefix %+ _ %+ %1) CAT_XDEFINE cglobaled_, %1, 1 extern %1 %endmacro ; like cextern, but without the prefix %macro cextern_naked 1 %xdefine %1 mangle(%1) CAT_XDEFINE cglobaled_, %1, 1 extern %1 %endmacro %macro const 1-2+ %xdefine %1 mangle(private_prefix %+ _ %+ %1) %ifidn __OUTPUT_FORMAT__,elf global %1:data hidden %else global %1 %endif ALIGN 32 %1: %2 %endmacro ; This is needed for ELF, otherwise the GNU linker assumes the stack is ; executable by default. %ifidn __OUTPUT_FORMAT__,elf SECTION .note.GNU-stack noalloc noexec nowrite progbits %endif ; cpuflags %assign cpuflags_mmx (1<<0) %assign cpuflags_mmx2 (1<<1) | cpuflags_mmx %assign cpuflags_3dnow (1<<2) | cpuflags_mmx %assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow %assign cpuflags_sse (1<<4) | cpuflags_mmx2 %assign cpuflags_sse2 (1<<5) | cpuflags_sse %assign cpuflags_sse2slow (1<<6) | cpuflags_sse2 %assign cpuflags_sse3 (1<<7) | cpuflags_sse2 %assign cpuflags_ssse3 (1<<8) | cpuflags_sse3 %assign cpuflags_sse4 (1<<9) | cpuflags_ssse3 %assign cpuflags_sse42 (1<<10)| cpuflags_sse4 %assign cpuflags_avx (1<<11)| cpuflags_sse42 %assign cpuflags_xop (1<<12)| cpuflags_avx %assign cpuflags_fma4 (1<<13)| cpuflags_avx %assign cpuflags_avx2 (1<<14)| cpuflags_avx %assign cpuflags_fma3 (1<<15)| cpuflags_avx %assign cpuflags_cache32 (1<<16) %assign cpuflags_cache64 (1<<17) %assign cpuflags_slowctz (1<<18) %assign cpuflags_lzcnt (1<<19) %assign cpuflags_aligned (1<<20) ; not a cpu feature, but a function variant %assign cpuflags_atom (1<<21) %assign cpuflags_bmi1 (1<<22)|cpuflags_lzcnt %assign cpuflags_bmi2 (1<<23)|cpuflags_bmi1 %define cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x)) %define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x)) ; Takes an arbitrary number of cpuflags from the above list. ; All subsequent functions (up to the next INIT_CPUFLAGS) are built for the specified cpu. ; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co.
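; A small illustration of cpuflag() dispatch (a hypothetical macro, modelled
; on the ABSW pattern in x86util.asm; it is not part of this file): under
; 'INIT_XMM ssse3' the first branch is assembled, under plain 'INIT_XMM' the
; SSE2 fallback is used.
;   %macro MYABSW 2        ; dst, src (dst != src)
;   %if cpuflag(ssse3)
;       pabsw  %1, %2      ; single instruction on SSSE3 and newer
;   %else
;       pxor   %1, %1
;       psubw  %1, %2      ; %1 = -src
;       pmaxsw %1, %2      ; %1 = max(src, -src) = |src|, word-wise
;   %endif
;   %endmacro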
%macro INIT_CPUFLAGS 0-* %xdefine SUFFIX %undef cpuname %assign cpuflags 0 %if %0 >= 1 %rep %0 %ifdef cpuname %xdefine cpuname cpuname %+ _%1 %else %xdefine cpuname %1 %endif %assign cpuflags cpuflags | cpuflags_%1 %rotate 1 %endrep %xdefine SUFFIX _ %+ cpuname %if cpuflag(avx) %assign avx_enabled 1 %endif %if (mmsize == 16 && notcpuflag(sse2)) || (mmsize == 32 && notcpuflag(avx2)) %define mova movaps %define movu movups %define movnta movntps %endif %if cpuflag(aligned) %define movu mova %elif cpuflag(sse3) && notcpuflag(ssse3) %define movu lddqu %endif %endif %if ARCH_X86_64 || cpuflag(sse2) CPU amdnop %else CPU basicnop %endif %endmacro ; Merge mmx and sse* ; m# is a simd register of the currently selected size ; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m# ; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m# ; (All 3 remain in sync through SWAP.) %macro CAT_XDEFINE 3 %xdefine %1%2 %3 %endmacro %macro CAT_UNDEF 2 %undef %1%2 %endmacro %macro INIT_MMX 0-1+ %assign avx_enabled 0 %define RESET_MM_PERMUTATION INIT_MMX %1 %define mmsize 8 %define num_mmregs 8 %define mova movq %define movu movq %define movh movd %define movnta movntq %assign %%i 0 %rep 8 CAT_XDEFINE m, %%i, mm %+ %%i CAT_XDEFINE nmm, %%i, %%i %assign %%i %%i+1 %endrep %rep 8 CAT_UNDEF m, %%i CAT_UNDEF nmm, %%i %assign %%i %%i+1 %endrep INIT_CPUFLAGS %1 %endmacro %macro INIT_XMM 0-1+ %assign avx_enabled 0 %define RESET_MM_PERMUTATION INIT_XMM %1 %define mmsize 16 %define num_mmregs 8 %if ARCH_X86_64 %define num_mmregs 16 %endif %define mova movdqa %define movu movdqu %define movh movq %define movnta movntdq %assign %%i 0 %rep num_mmregs CAT_XDEFINE m, %%i, xmm %+ %%i CAT_XDEFINE nxmm, %%i, %%i %assign %%i %%i+1 %endrep INIT_CPUFLAGS %1 %endmacro %macro INIT_YMM 0-1+ %assign avx_enabled 1 %define RESET_MM_PERMUTATION INIT_YMM %1 %define mmsize 32 %define num_mmregs 8 %if ARCH_X86_64 %define num_mmregs 16 %endif %define mova movdqa %define movu movdqu %undef movh %define movnta movntdq %assign %%i 0 %rep num_mmregs CAT_XDEFINE m, %%i, ymm %+ %%i CAT_XDEFINE nymm, %%i, %%i %assign %%i %%i+1 %endrep INIT_CPUFLAGS %1 %endmacro INIT_XMM %macro DECLARE_MMCAST 1 %define mmmm%1 mm%1 %define mmxmm%1 mm%1 %define mmymm%1 mm%1 %define xmmmm%1 mm%1 %define xmmxmm%1 xmm%1 %define xmmymm%1 xmm%1 %define ymmmm%1 mm%1 %define ymmxmm%1 xmm%1 %define ymmymm%1 ymm%1 %define ymm%1xmm xmm%1 %define xmm%1ymm ymm%1 %define xm%1 xmm %+ m%1 %define ym%1 ymm %+ m%1 %endmacro %assign i 0 %rep 16 DECLARE_MMCAST i %assign i i+1 %endrep ; I often want to use macros that permute their arguments. e.g. there's no ; efficient way to implement butterfly or transpose or dct without swapping some ; arguments. ; ; I would like to not have to manually keep track of the permutations: ; If I insert a permutation in the middle of a function, it should automatically ; change everything that follows. For more complex macros I may also have multiple ; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations. ; ; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that ; permutes its arguments. It's equivalent to exchanging the contents of the ; registers, except that this way you exchange the register names instead, so it ; doesn't cost any cycles. 
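; For instance, after 'SWAP 0, 1' (with INIT_XMM in effect) the name m0
; refers to xmm1 and m1 to xmm0, so a subsequent 'mova m0, m2' assembles as
; 'movdqa xmm1, xmm2'; the exchange itself emits no instruction.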
%macro PERMUTE 2-* ; takes a list of pairs to swap %rep %0/2 %xdefine %%tmp%2 m%2 %rotate 2 %endrep %rep %0/2 %xdefine m%1 %%tmp%2 CAT_XDEFINE n, m%1, %1 %rotate 2 %endrep %endmacro %macro SWAP 2+ ; swaps a single chain (sometimes more concise than pairs) %ifnum %1 ; SWAP 0, 1, ... SWAP_INTERNAL_NUM %1, %2 %else ; SWAP m0, m1, ... SWAP_INTERNAL_NAME %1, %2 %endif %endmacro %macro SWAP_INTERNAL_NUM 2-* %rep %0-1 %xdefine %%tmp m%1 %xdefine m%1 m%2 %xdefine m%2 %%tmp CAT_XDEFINE n, m%1, %1 CAT_XDEFINE n, m%2, %2 %rotate 1 %endrep %endmacro %macro SWAP_INTERNAL_NAME 2-* %xdefine %%args n %+ %1 %rep %0-1 %xdefine %%args %%args, n %+ %2 %rotate 1 %endrep SWAP_INTERNAL_NUM %%args %endmacro ; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later ; calls to that function will automatically load the permutation, so values can ; be returned in mmregs. %macro SAVE_MM_PERMUTATION 0-1 %if %0 %xdefine %%f %1_m %else %xdefine %%f current_function %+ _m %endif %assign %%i 0 %rep num_mmregs CAT_XDEFINE %%f, %%i, m %+ %%i %assign %%i %%i+1 %endrep %endmacro %macro LOAD_MM_PERMUTATION 1 ; name to load from %ifdef %1_m0 %assign %%i 0 %rep num_mmregs CAT_XDEFINE m, %%i, %1_m %+ %%i CAT_XDEFINE n, m %+ %%i, %%i %assign %%i %%i+1 %endrep %endif %endmacro ; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't %macro call 1 call_internal %1, %1 %+ SUFFIX %endmacro %macro call_internal 2 %xdefine %%i %1 %ifndef cglobaled_%1 %ifdef cglobaled_%2 %xdefine %%i %2 %endif %endif call %%i LOAD_MM_PERMUTATION %%i %endmacro ; Substitutions that reduce instruction size but are functionally equivalent %macro add 2 %ifnum %2 %if %2==128 sub %1, -128 %else add %1, %2 %endif %else add %1, %2 %endif %endmacro %macro sub 2 %ifnum %2 %if %2==128 add %1, -128 %else sub %1, %2 %endif %else sub %1, %2 %endif %endmacro ;============================================================================= ; AVX abstraction layer ;============================================================================= %assign i 0 %rep 16 %if i < 8 CAT_XDEFINE sizeofmm, i, 8 %endif CAT_XDEFINE sizeofxmm, i, 16 CAT_XDEFINE sizeofymm, i, 32 %assign i i+1 %endrep %undef i %macro CHECK_AVX_INSTR_EMU 3-* %xdefine %%opcode %1 %xdefine %%dst %2 %rep %0-2 %ifidn %%dst, %3 %error non-avx emulation of ``%%opcode'' is not supported %endif %rotate 1 %endrep %endmacro ;%1 == instruction ;%2 == minimal instruction set ;%3 == 1 if float, 0 if int ;%4 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise ;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not ;%6+: operands %macro RUN_AVX_INSTR 6-9+ %ifnum sizeof%7 %assign __sizeofreg sizeof%7 %elifnum sizeof%6 %assign __sizeofreg sizeof%6 %else %assign __sizeofreg mmsize %endif %assign __emulate_avx 0 %if avx_enabled && __sizeofreg >= 16 %xdefine __instr v%1 %else %xdefine __instr %1 %if %0 >= 8+%4 %assign __emulate_avx 1 %endif %endif %ifnidn %2, fnord %ifdef cpuname %if notcpuflag(%2) %error use of ``%1'' %2 instruction in cpuname function: current_function %endif %endif %endif %if __emulate_avx %xdefine __src1 %7 %xdefine __src2 %8 %ifnidn %6, %7 %if %0 >= 9 CHECK_AVX_INSTR_EMU {%1 %6, %7, %8, %9}, %6, %8, %9 %else CHECK_AVX_INSTR_EMU {%1 %6, %7, %8}, %6, %8 %endif %if %5 && %4 == 0 %ifnid %8 ; 3-operand AVX instructions with a memory arg can only have it in src2, ; whereas SSE emulation prefers to have it in src1 (i.e. the mov). ; So, if the instruction is commutative with a memory arg, swap them. 
%xdefine __src1 %8 %xdefine __src2 %7 %endif %endif %if __sizeofreg == 8 MOVQ %6, __src1 %elif %3 MOVAPS %6, __src1 %else MOVDQA %6, __src1 %endif %endif %if %0 >= 9 %1 %6, __src2, %9 %else %1 %6, __src2 %endif %elif %0 >= 9 __instr %6, %7, %8, %9 %elif %0 == 8 __instr %6, %7, %8 %elif %0 == 7 __instr %6, %7 %else __instr %6 %endif %endmacro ;%1 == instruction ;%2 == minimal instruction set ;%3 == 1 if float, 0 if int ;%4 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise ;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not %macro AVX_INSTR 1-5 fnord, 0, 1, 0 %macro %1 1-10 fnord, fnord, fnord, fnord, %1, %2, %3, %4, %5 %ifidn %2, fnord RUN_AVX_INSTR %6, %7, %8, %9, %10, %1 %elifidn %3, fnord RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2 %elifidn %4, fnord RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3 %elifidn %5, fnord RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4 %else RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4, %5 %endif %endmacro %endmacro ; Instructions with both VEX and non-VEX encodings ; Non-destructive instructions are written without parameters AVX_INSTR addpd, sse2, 1, 0, 1 AVX_INSTR addps, sse, 1, 0, 1 AVX_INSTR addsd, sse2, 1, 0, 1 AVX_INSTR addss, sse, 1, 0, 1 AVX_INSTR addsubpd, sse3, 1, 0, 0 AVX_INSTR addsubps, sse3, 1, 0, 0 AVX_INSTR aesdec, fnord, 0, 0, 0 AVX_INSTR aesdeclast, fnord, 0, 0, 0 AVX_INSTR aesenc, fnord, 0, 0, 0 AVX_INSTR aesenclast, fnord, 0, 0, 0 AVX_INSTR aesimc AVX_INSTR aeskeygenassist AVX_INSTR andnpd, sse2, 1, 0, 0 AVX_INSTR andnps, sse, 1, 0, 0 AVX_INSTR andpd, sse2, 1, 0, 1 AVX_INSTR andps, sse, 1, 0, 1 AVX_INSTR blendpd, sse4, 1, 0, 0 AVX_INSTR blendps, sse4, 1, 0, 0 AVX_INSTR blendvpd, sse4, 1, 0, 0 AVX_INSTR blendvps, sse4, 1, 0, 0 AVX_INSTR cmppd, sse2, 1, 1, 0 AVX_INSTR cmpps, sse, 1, 1, 0 AVX_INSTR cmpsd, sse2, 1, 1, 0 AVX_INSTR cmpss, sse, 1, 1, 0 AVX_INSTR comisd, sse2 AVX_INSTR comiss, sse AVX_INSTR cvtdq2pd, sse2 AVX_INSTR cvtdq2ps, sse2 AVX_INSTR cvtpd2dq, sse2 AVX_INSTR cvtpd2ps, sse2 AVX_INSTR cvtps2dq, sse2 AVX_INSTR cvtps2pd, sse2 AVX_INSTR cvtsd2si, sse2 AVX_INSTR cvtsd2ss, sse2 AVX_INSTR cvtsi2sd, sse2 AVX_INSTR cvtsi2ss, sse AVX_INSTR cvtss2sd, sse2 AVX_INSTR cvtss2si, sse AVX_INSTR cvttpd2dq, sse2 AVX_INSTR cvttps2dq, sse2 AVX_INSTR cvttsd2si, sse2 AVX_INSTR cvttss2si, sse AVX_INSTR divpd, sse2, 1, 0, 0 AVX_INSTR divps, sse, 1, 0, 0 AVX_INSTR divsd, sse2, 1, 0, 0 AVX_INSTR divss, sse, 1, 0, 0 AVX_INSTR dppd, sse4, 1, 1, 0 AVX_INSTR dpps, sse4, 1, 1, 0 AVX_INSTR extractps, sse4 AVX_INSTR haddpd, sse3, 1, 0, 0 AVX_INSTR haddps, sse3, 1, 0, 0 AVX_INSTR hsubpd, sse3, 1, 0, 0 AVX_INSTR hsubps, sse3, 1, 0, 0 AVX_INSTR insertps, sse4, 1, 1, 0 AVX_INSTR lddqu, sse3 AVX_INSTR ldmxcsr, sse AVX_INSTR maskmovdqu, sse2 AVX_INSTR maxpd, sse2, 1, 0, 1 AVX_INSTR maxps, sse, 1, 0, 1 AVX_INSTR maxsd, sse2, 1, 0, 1 AVX_INSTR maxss, sse, 1, 0, 1 AVX_INSTR minpd, sse2, 1, 0, 1 AVX_INSTR minps, sse, 1, 0, 1 AVX_INSTR minsd, sse2, 1, 0, 1 AVX_INSTR minss, sse, 1, 0, 1 AVX_INSTR movapd, sse2 AVX_INSTR movaps, sse AVX_INSTR movd AVX_INSTR movddup, sse3 AVX_INSTR movdqa, sse2 AVX_INSTR movdqu, sse2 AVX_INSTR movhlps, sse, 1, 0, 0 AVX_INSTR movhpd, sse2, 1, 0, 0 AVX_INSTR movhps, sse, 1, 0, 0 AVX_INSTR movlhps, sse, 1, 0, 0 AVX_INSTR movlpd, sse2, 1, 0, 0 AVX_INSTR movlps, sse, 1, 0, 0 AVX_INSTR movmskpd, sse2 AVX_INSTR movmskps, sse AVX_INSTR movntdq, sse2 AVX_INSTR movntdqa, sse4 AVX_INSTR movntpd, sse2 AVX_INSTR movntps, sse AVX_INSTR movq AVX_INSTR movsd, sse2, 1, 0, 0 AVX_INSTR movshdup, sse3 AVX_INSTR 
movsldup, sse3 AVX_INSTR movss, sse, 1, 0, 0 AVX_INSTR movupd, sse2 AVX_INSTR movups, sse AVX_INSTR mpsadbw, sse4 AVX_INSTR mulpd, sse2, 1, 0, 1 AVX_INSTR mulps, sse, 1, 0, 1 AVX_INSTR mulsd, sse2, 1, 0, 1 AVX_INSTR mulss, sse, 1, 0, 1 AVX_INSTR orpd, sse2, 1, 0, 1 AVX_INSTR orps, sse, 1, 0, 1 AVX_INSTR pabsb, ssse3 AVX_INSTR pabsd, ssse3 AVX_INSTR pabsw, ssse3 AVX_INSTR packsswb, mmx, 0, 0, 0 AVX_INSTR packssdw, mmx, 0, 0, 0 AVX_INSTR packuswb, mmx, 0, 0, 0 AVX_INSTR packusdw, sse4, 0, 0, 0 AVX_INSTR paddb, mmx, 0, 0, 1 AVX_INSTR paddw, mmx, 0, 0, 1 AVX_INSTR paddd, mmx, 0, 0, 1 AVX_INSTR paddq, sse2, 0, 0, 1 AVX_INSTR paddsb, mmx, 0, 0, 1 AVX_INSTR paddsw, mmx, 0, 0, 1 AVX_INSTR paddusb, mmx, 0, 0, 1 AVX_INSTR paddusw, mmx, 0, 0, 1 AVX_INSTR palignr, ssse3 AVX_INSTR pand, mmx, 0, 0, 1 AVX_INSTR pandn, mmx, 0, 0, 0 AVX_INSTR pavgb, mmx2, 0, 0, 1 AVX_INSTR pavgw, mmx2, 0, 0, 1 AVX_INSTR pblendvb, sse4, 0, 0, 0 AVX_INSTR pblendw, sse4 AVX_INSTR pclmulqdq AVX_INSTR pcmpestri, sse42 AVX_INSTR pcmpestrm, sse42 AVX_INSTR pcmpistri, sse42 AVX_INSTR pcmpistrm, sse42 AVX_INSTR pcmpeqb, mmx, 0, 0, 1 AVX_INSTR pcmpeqw, mmx, 0, 0, 1 AVX_INSTR pcmpeqd, mmx, 0, 0, 1 AVX_INSTR pcmpeqq, sse4, 0, 0, 1 AVX_INSTR pcmpgtb, mmx, 0, 0, 0 AVX_INSTR pcmpgtw, mmx, 0, 0, 0 AVX_INSTR pcmpgtd, mmx, 0, 0, 0 AVX_INSTR pcmpgtq, sse42, 0, 0, 0 AVX_INSTR pextrb, sse4 AVX_INSTR pextrd, sse4 AVX_INSTR pextrq, sse4 AVX_INSTR pextrw, mmx2 AVX_INSTR phaddw, ssse3, 0, 0, 0 AVX_INSTR phaddd, ssse3, 0, 0, 0 AVX_INSTR phaddsw, ssse3, 0, 0, 0 AVX_INSTR phminposuw, sse4 AVX_INSTR phsubw, ssse3, 0, 0, 0 AVX_INSTR phsubd, ssse3, 0, 0, 0 AVX_INSTR phsubsw, ssse3, 0, 0, 0 AVX_INSTR pinsrb, sse4 AVX_INSTR pinsrd, sse4 AVX_INSTR pinsrq, sse4 AVX_INSTR pinsrw, mmx2 AVX_INSTR pmaddwd, mmx, 0, 0, 1 AVX_INSTR pmaddubsw, ssse3, 0, 0, 0 AVX_INSTR pmaxsb, sse4, 0, 0, 1 AVX_INSTR pmaxsw, mmx2, 0, 0, 1 AVX_INSTR pmaxsd, sse4, 0, 0, 1 AVX_INSTR pmaxub, mmx2, 0, 0, 1 AVX_INSTR pmaxuw, sse4, 0, 0, 1 AVX_INSTR pmaxud, sse4, 0, 0, 1 AVX_INSTR pminsb, sse4, 0, 0, 1 AVX_INSTR pminsw, mmx2, 0, 0, 1 AVX_INSTR pminsd, sse4, 0, 0, 1 AVX_INSTR pminub, mmx2, 0, 0, 1 AVX_INSTR pminuw, sse4, 0, 0, 1 AVX_INSTR pminud, sse4, 0, 0, 1 AVX_INSTR pmovmskb, mmx2 AVX_INSTR pmovsxbw, sse4 AVX_INSTR pmovsxbd, sse4 AVX_INSTR pmovsxbq, sse4 AVX_INSTR pmovsxwd, sse4 AVX_INSTR pmovsxwq, sse4 AVX_INSTR pmovsxdq, sse4 AVX_INSTR pmovzxbw, sse4 AVX_INSTR pmovzxbd, sse4 AVX_INSTR pmovzxbq, sse4 AVX_INSTR pmovzxwd, sse4 AVX_INSTR pmovzxwq, sse4 AVX_INSTR pmovzxdq, sse4 AVX_INSTR pmuldq, sse4, 0, 0, 1 AVX_INSTR pmulhrsw, ssse3, 0, 0, 1 AVX_INSTR pmulhuw, mmx2, 0, 0, 1 AVX_INSTR pmulhw, mmx, 0, 0, 1 AVX_INSTR pmullw, mmx, 0, 0, 1 AVX_INSTR pmulld, sse4, 0, 0, 1 AVX_INSTR pmuludq, sse2, 0, 0, 1 AVX_INSTR por, mmx, 0, 0, 1 AVX_INSTR psadbw, mmx2, 0, 0, 1 AVX_INSTR pshufb, ssse3, 0, 0, 0 AVX_INSTR pshufd, sse2 AVX_INSTR pshufhw, sse2 AVX_INSTR pshuflw, sse2 AVX_INSTR psignb, ssse3, 0, 0, 0 AVX_INSTR psignw, ssse3, 0, 0, 0 AVX_INSTR psignd, ssse3, 0, 0, 0 AVX_INSTR psllw, mmx, 0, 0, 0 AVX_INSTR pslld, mmx, 0, 0, 0 AVX_INSTR psllq, mmx, 0, 0, 0 AVX_INSTR pslldq, sse2, 0, 0, 0 AVX_INSTR psraw, mmx, 0, 0, 0 AVX_INSTR psrad, mmx, 0, 0, 0 AVX_INSTR psrlw, mmx, 0, 0, 0 AVX_INSTR psrld, mmx, 0, 0, 0 AVX_INSTR psrlq, mmx, 0, 0, 0 AVX_INSTR psrldq, sse2, 0, 0, 0 AVX_INSTR psubb, mmx, 0, 0, 0 AVX_INSTR psubw, mmx, 0, 0, 0 AVX_INSTR psubd, mmx, 0, 0, 0 AVX_INSTR psubq, sse2, 0, 0, 0 AVX_INSTR psubsb, mmx, 0, 0, 0 AVX_INSTR psubsw, mmx, 0, 0, 0 AVX_INSTR psubusb, mmx, 0, 0, 0 AVX_INSTR psubusw, 
mmx, 0, 0, 0 AVX_INSTR ptest, sse4 AVX_INSTR punpckhbw, mmx, 0, 0, 0 AVX_INSTR punpckhwd, mmx, 0, 0, 0 AVX_INSTR punpckhdq, mmx, 0, 0, 0 AVX_INSTR punpckhqdq, sse2, 0, 0, 0 AVX_INSTR punpcklbw, mmx, 0, 0, 0 AVX_INSTR punpcklwd, mmx, 0, 0, 0 AVX_INSTR punpckldq, mmx, 0, 0, 0 AVX_INSTR punpcklqdq, sse2, 0, 0, 0 AVX_INSTR pxor, mmx, 0, 0, 1 AVX_INSTR rcpps, sse, 1, 0, 0 AVX_INSTR rcpss, sse, 1, 0, 0 AVX_INSTR roundpd, sse4 AVX_INSTR roundps, sse4 AVX_INSTR roundsd, sse4 AVX_INSTR roundss, sse4 AVX_INSTR rsqrtps, sse, 1, 0, 0 AVX_INSTR rsqrtss, sse, 1, 0, 0 AVX_INSTR shufpd, sse2, 1, 1, 0 AVX_INSTR shufps, sse, 1, 1, 0 AVX_INSTR sqrtpd, sse2, 1, 0, 0 AVX_INSTR sqrtps, sse, 1, 0, 0 AVX_INSTR sqrtsd, sse2, 1, 0, 0 AVX_INSTR sqrtss, sse, 1, 0, 0 AVX_INSTR stmxcsr, sse AVX_INSTR subpd, sse2, 1, 0, 0 AVX_INSTR subps, sse, 1, 0, 0 AVX_INSTR subsd, sse2, 1, 0, 0 AVX_INSTR subss, sse, 1, 0, 0 AVX_INSTR ucomisd, sse2 AVX_INSTR ucomiss, sse AVX_INSTR unpckhpd, sse2, 1, 0, 0 AVX_INSTR unpckhps, sse, 1, 0, 0 AVX_INSTR unpcklpd, sse2, 1, 0, 0 AVX_INSTR unpcklps, sse, 1, 0, 0 AVX_INSTR xorpd, sse2, 1, 0, 1 AVX_INSTR xorps, sse, 1, 0, 1 ; 3DNow instructions, for sharing code between AVX, SSE and 3DN AVX_INSTR pfadd, 3dnow, 1, 0, 1 AVX_INSTR pfsub, 3dnow, 1, 0, 0 AVX_INSTR pfmul, 3dnow, 1, 0, 1 ; base-4 constants for shuffles %assign i 0 %rep 256 %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3) %if j < 10 CAT_XDEFINE q000, j, i %elif j < 100 CAT_XDEFINE q00, j, i %elif j < 1000 CAT_XDEFINE q0, j, i %else CAT_XDEFINE q, j, i %endif %assign i i+1 %endrep %undef i %undef j %macro FMA_INSTR 3 %macro %1 4-7 %1, %2, %3 %if cpuflag(xop) v%5 %1, %2, %3, %4 %elifnidn %1, %4 %6 %1, %2, %3 %7 %1, %4 %else %error non-xop emulation of ``%5 %1, %2, %3, %4'' is not supported %endif %endmacro %endmacro FMA_INSTR pmacsww, pmullw, paddw FMA_INSTR pmacsdd, pmulld, paddd ; sse4 emulation FMA_INSTR pmacsdql, pmuldq, paddq ; sse4 emulation FMA_INSTR pmadcswd, pmaddwd, paddd ; convert FMA4 to FMA3 if possible %macro FMA4_INSTR 4 %macro %1 4-8 %1, %2, %3, %4 %if cpuflag(fma4) v%5 %1, %2, %3, %4 %elifidn %1, %2 v%6 %1, %4, %3 ; %1 = %1 * %3 + %4 %elifidn %1, %3 v%7 %1, %2, %4 ; %1 = %2 * %1 + %4 %elifidn %1, %4 v%8 %1, %2, %3 ; %1 = %2 * %3 + %1 %else %error fma3 emulation of ``%5 %1, %2, %3, %4'' is not supported %endif %endmacro %endmacro FMA4_INSTR fmaddpd, fmadd132pd, fmadd213pd, fmadd231pd FMA4_INSTR fmaddps, fmadd132ps, fmadd213ps, fmadd231ps FMA4_INSTR fmaddsd, fmadd132sd, fmadd213sd, fmadd231sd FMA4_INSTR fmaddss, fmadd132ss, fmadd213ss, fmadd231ss FMA4_INSTR fmaddsubpd, fmaddsub132pd, fmaddsub213pd, fmaddsub231pd FMA4_INSTR fmaddsubps, fmaddsub132ps, fmaddsub213ps, fmaddsub231ps FMA4_INSTR fmsubaddpd, fmsubadd132pd, fmsubadd213pd, fmsubadd231pd FMA4_INSTR fmsubaddps, fmsubadd132ps, fmsubadd213ps, fmsubadd231ps FMA4_INSTR fmsubpd, fmsub132pd, fmsub213pd, fmsub231pd FMA4_INSTR fmsubps, fmsub132ps, fmsub213ps, fmsub231ps FMA4_INSTR fmsubsd, fmsub132sd, fmsub213sd, fmsub231sd FMA4_INSTR fmsubss, fmsub132ss, fmsub213ss, fmsub231ss FMA4_INSTR fnmaddpd, fnmadd132pd, fnmadd213pd, fnmadd231pd FMA4_INSTR fnmaddps, fnmadd132ps, fnmadd213ps, fnmadd231ps FMA4_INSTR fnmaddsd, fnmadd132sd, fnmadd213sd, fnmadd231sd FMA4_INSTR fnmaddss, fnmadd132ss, fnmadd213ss, fnmadd231ss FMA4_INSTR fnmsubpd, fnmsub132pd, fnmsub213pd, fnmsub231pd FMA4_INSTR fnmsubps, fnmsub132ps, fnmsub213ps, fnmsub231ps FMA4_INSTR fnmsubsd, fnmsub132sd, fnmsub213sd, fnmsub231sd FMA4_INSTR fnmsubss, fnmsub132ss, fnmsub213ss, fnmsub231ss ; workaround: 
vpbroadcastq is broken in x86_32 due to a yasm bug %if ARCH_X86_64 == 0 %macro vpbroadcastq 2 %if sizeof%1 == 16 movddup %1, %2 %else vbroadcastsd %1, %2 %endif %endmacro %endif ; workaround: vpbroadcastd with register, the yasm will generate wrong code %macro vpbroadcastd 2 %ifid %2 movd %1 %+ xmm, %2 vpbroadcastd %1, %1 %+ xmm %else vpbroadcastd %1, %2 %endif %endmacro davs2-1.6/source/common/x86/x86util.asm000066400000000000000000000513661337322544400176630ustar00rootroot00000000000000;***************************************************************************** ;* x86util.asm: x86 utility macros ;***************************************************************************** ;* Copyright (C) 2008-2013 x264 project ;* ;* Authors: Holger Lubitz ;* Loren Merritt ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by ;* the Free Software Foundation; either version 2 of the License, or ;* (at your option) any later version. ;* ;* This program is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;* GNU General Public License for more details. ;* ;* You should have received a copy of the GNU General Public License ;* along with this program; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. ;* ;* This program is also available under a commercial proprietary license. ;* For more information, contact us at license @ x265.com. ;***************************************************************************** %assign FENC_STRIDE 64 %assign FDEC_STRIDE 32 %assign SIZEOF_PIXEL 1 %assign SIZEOF_DCTCOEF 2 %define pixel byte %define vpbroadcastdct vpbroadcastw %define vpbroadcastpix vpbroadcastb %if HIGH_BIT_DEPTH %assign SIZEOF_PIXEL 2 %assign SIZEOF_DCTCOEF 4 %define pixel word %define vpbroadcastdct vpbroadcastd %define vpbroadcastpix vpbroadcastw %endif %assign FENC_STRIDEB SIZEOF_PIXEL*FENC_STRIDE %assign FDEC_STRIDEB SIZEOF_PIXEL*FDEC_STRIDE %assign PIXEL_MAX ((1 << BIT_DEPTH)-1) %macro FIX_STRIDES 1-* %if HIGH_BIT_DEPTH %rep %0 add %1, %1 %rotate 1 %endrep %endif %endmacro %macro SBUTTERFLY 4 %ifidn %1, dqqq vperm2i128 m%4, m%2, m%3, q0301 ; punpckh vinserti128 m%2, m%2, xm%3, 1 ; punpckl %elif avx_enabled && mmsize >= 16 punpckh%1 m%4, m%2, m%3 punpckl%1 m%2, m%3 %else mova m%4, m%2 punpckl%1 m%2, m%3 punpckh%1 m%4, m%3 %endif SWAP %3, %4 %endmacro %macro SBUTTERFLY2 4 punpckl%1 m%4, m%2, m%3 punpckh%1 m%2, m%2, m%3 SWAP %2, %4, %3 %endmacro %macro TRANSPOSE4x4W 5 SBUTTERFLY wd, %1, %2, %5 SBUTTERFLY wd, %3, %4, %5 SBUTTERFLY dq, %1, %3, %5 SBUTTERFLY dq, %2, %4, %5 SWAP %2, %3 %endmacro %macro TRANSPOSE2x4x4W 5 SBUTTERFLY wd, %1, %2, %5 SBUTTERFLY wd, %3, %4, %5 SBUTTERFLY dq, %1, %3, %5 SBUTTERFLY dq, %2, %4, %5 SBUTTERFLY qdq, %1, %2, %5 SBUTTERFLY qdq, %3, %4, %5 %endmacro %macro TRANSPOSE4x4D 5 SBUTTERFLY dq, %1, %2, %5 SBUTTERFLY dq, %3, %4, %5 SBUTTERFLY qdq, %1, %3, %5 SBUTTERFLY qdq, %2, %4, %5 SWAP %2, %3 %endmacro %macro TRANSPOSE8x8W 9-11 %if ARCH_X86_64 SBUTTERFLY wd, %1, %2, %9 SBUTTERFLY wd, %3, %4, %9 SBUTTERFLY wd, %5, %6, %9 SBUTTERFLY wd, %7, %8, %9 SBUTTERFLY dq, %1, %3, %9 SBUTTERFLY dq, %2, %4, %9 SBUTTERFLY dq, %5, %7, %9 SBUTTERFLY dq, %6, %8, %9 SBUTTERFLY qdq, %1, %5, %9 SBUTTERFLY qdq, %2, %6, %9 SBUTTERFLY qdq, %3, %7, %9 SBUTTERFLY qdq, %4, %8, %9 SWAP %2, %5 SWAP %4, %7 %else ; in: m0..m7, 
unless %11 in which case m6 is in %9 ; out: m0..m7, unless %11 in which case m4 is in %10 ; spills into %9 and %10 %if %0<11 movdqa %9, m%7 %endif SBUTTERFLY wd, %1, %2, %7 movdqa %10, m%2 movdqa m%7, %9 SBUTTERFLY wd, %3, %4, %2 SBUTTERFLY wd, %5, %6, %2 SBUTTERFLY wd, %7, %8, %2 SBUTTERFLY dq, %1, %3, %2 movdqa %9, m%3 movdqa m%2, %10 SBUTTERFLY dq, %2, %4, %3 SBUTTERFLY dq, %5, %7, %3 SBUTTERFLY dq, %6, %8, %3 SBUTTERFLY qdq, %1, %5, %3 SBUTTERFLY qdq, %2, %6, %3 movdqa %10, m%2 movdqa m%3, %9 SBUTTERFLY qdq, %3, %7, %2 SBUTTERFLY qdq, %4, %8, %2 SWAP %2, %5 SWAP %4, %7 %if %0<11 movdqa m%5, %10 %endif %endif %endmacro %macro WIDEN_SXWD 2 punpckhwd m%2, m%1 psrad m%2, 16 %if cpuflag(sse4) pmovsxwd m%1, m%1 %else punpcklwd m%1, m%1 psrad m%1, 16 %endif %endmacro %macro ABSW 2-3 ; dst, src, tmp (tmp used only if dst==src) %if cpuflag(ssse3) pabsw %1, %2 %elifidn %3, sign ; version for pairing with PSIGNW: modifies src pxor %1, %1 pcmpgtw %1, %2 pxor %2, %1 psubw %2, %1 SWAP %1, %2 %elifidn %1, %2 pxor %3, %3 psubw %3, %1 pmaxsw %1, %3 %elifid %2 pxor %1, %1 psubw %1, %2 pmaxsw %1, %2 %elif %0 == 2 pxor %1, %1 psubw %1, %2 pmaxsw %1, %2 %else mova %1, %2 pxor %3, %3 psubw %3, %1 pmaxsw %1, %3 %endif %endmacro %macro ABSW2 6 ; dst1, dst2, src1, src2, tmp, tmp %if cpuflag(ssse3) pabsw %1, %3 pabsw %2, %4 %elifidn %1, %3 pxor %5, %5 pxor %6, %6 psubw %5, %1 psubw %6, %2 pmaxsw %1, %5 pmaxsw %2, %6 %else pxor %1, %1 pxor %2, %2 psubw %1, %3 psubw %2, %4 pmaxsw %1, %3 pmaxsw %2, %4 %endif %endmacro %macro ABSB 2 %if cpuflag(ssse3) pabsb %1, %1 %else pxor %2, %2 psubb %2, %1 pminub %1, %2 %endif %endmacro %macro ABSD 2-3 %if cpuflag(ssse3) pabsd %1, %2 %else %define %%s %2 %if %0 == 3 mova %3, %2 %define %%s %3 %endif pxor %1, %1 pcmpgtd %1, %%s pxor %%s, %1 psubd %%s, %1 SWAP %1, %%s %endif %endmacro %macro PSIGN 3-4 %if cpuflag(ssse3) && %0 == 4 psign%1 %2, %3, %4 %elif cpuflag(ssse3) psign%1 %2, %3 %elif %0 == 4 pxor %2, %3, %4 psub%1 %2, %4 %else pxor %2, %3 psub%1 %2, %3 %endif %endmacro %define PSIGNW PSIGN w, %define PSIGND PSIGN d, %macro SPLATB_LOAD 3 %if cpuflag(ssse3) movd %1, [%2-3] pshufb %1, %3 %else movd %1, [%2-3] ;to avoid crossing a cacheline punpcklbw %1, %1 SPLATW %1, %1, 3 %endif %endmacro %imacro SPLATW 2-3 0 %if cpuflag(avx2) && %3 == 0 vpbroadcastw %1, %2 %else PSHUFLW %1, %2, (%3)*q1111 %if mmsize == 16 punpcklqdq %1, %1 %endif %endif %endmacro %imacro SPLATD 2-3 0 %if mmsize == 16 pshufd %1, %2, (%3)*q1111 %else pshufw %1, %2, (%3)*q0101 + ((%3)+1)*q1010 %endif %endmacro %macro CLIPW 3 ;(dst, min, max) pmaxsw %1, %2 pminsw %1, %3 %endmacro %macro CLIPW2 4 ;(dst0, dst1, min, max) pmaxsw %1, %3 pmaxsw %2, %3 pminsw %1, %4 pminsw %2, %4 %endmacro %macro HADDD 2 ; sum junk %if sizeof%1 == 32 %define %2 xmm%2 vextracti128 %2, %1, 1 %define %1 xmm%1 paddd %1, %2 %endif %if mmsize >= 16 %if cpuflag(xop) && sizeof%1 == 16 vphadddq %1, %1 %endif movhlps %2, %1 paddd %1, %2 %endif %if notcpuflag(xop) PSHUFLW %2, %1, q0032 paddd %1, %2 %endif %undef %1 %undef %2 %endmacro %macro HADDW 2 ; reg, tmp %if cpuflag(xop) && sizeof%1 == 16 vphaddwq %1, %1 movhlps %2, %1 paddd %1, %2 %else pmaddwd %1, [pw_1] HADDD %1, %2 %endif %endmacro %macro HADDUWD 2 %if cpuflag(xop) && sizeof%1 == 16 vphadduwd %1, %1 %else psrld %2, %1, 16 pslld %1, 16 psrld %1, 16 paddd %1, %2 %endif %endmacro %macro HADDUW 2 %if cpuflag(xop) && sizeof%1 == 16 vphadduwq %1, %1 movhlps %2, %1 paddd %1, %2 %else HADDUWD %1, %2 HADDD %1, %2 %endif %endmacro %macro PALIGNR 4-5 ; [dst,] src1, src2, imm, tmp ; AVX2 version 
uses a precalculated extra input that ; can be re-used across calls %if sizeof%1==32 ; %3 = abcdefgh ijklmnop (lower address) ; %2 = ABCDEFGH IJKLMNOP (higher address) ; vperm2i128 %5, %2, %3, q0003 ; %5 = ijklmnop ABCDEFGH %if %4 < 16 palignr %1, %5, %3, %4 ; %1 = bcdefghi jklmnopA %else palignr %1, %2, %5, %4-16 ; %1 = pABCDEFG HIJKLMNO %endif %elif cpuflag(ssse3) %if %0==5 palignr %1, %2, %3, %4 %else palignr %1, %2, %3 %endif %else %define %%dst %1 %if %0==5 %ifnidn %1, %2 mova %%dst, %2 %endif %rotate 1 %endif %ifnidn %4, %2 mova %4, %2 %endif %if mmsize==8 psllq %%dst, (8-%3)*8 psrlq %4, %3*8 %else pslldq %%dst, 16-%3 psrldq %4, %3 %endif por %%dst, %4 %endif %endmacro %macro PSHUFLW 1+ %if mmsize == 8 pshufw %1 %else pshuflw %1 %endif %endmacro ; shift a mmxreg by n bytes, or a xmmreg by 2*n bytes ; values shifted in are undefined ; faster if dst==src %define PSLLPIX PSXLPIX l, -1, ;dst, src, shift %define PSRLPIX PSXLPIX r, 1, ;dst, src, shift %macro PSXLPIX 5 %if mmsize == 8 %if %5&1 ps%1lq %3, %4, %5*8 %else pshufw %3, %4, (q3210<<8>>(8+%2*%5))&0xff %endif %else ps%1ldq %3, %4, %5*2 %endif %endmacro %macro DEINTB 5 ; mask, reg1, mask, reg2, optional src to fill masks from %ifnum %5 pand m%3, m%5, m%4 ; src .. y6 .. y4 pand m%1, m%5, m%2 ; dst .. y6 .. y4 %else mova m%1, %5 pand m%3, m%1, m%4 ; src .. y6 .. y4 pand m%1, m%1, m%2 ; dst .. y6 .. y4 %endif psrlw m%2, 8 ; dst .. y7 .. y5 psrlw m%4, 8 ; src .. y7 .. y5 %endmacro %macro SUMSUB_BA 3-4 %if %0==3 padd%1 m%2, m%3 padd%1 m%3, m%3 psub%1 m%3, m%2 %elif avx_enabled padd%1 m%4, m%2, m%3 psub%1 m%3, m%2 SWAP %2, %4 %else mova m%4, m%2 padd%1 m%2, m%3 psub%1 m%3, m%4 %endif %endmacro %macro SUMSUB_BADC 5-6 %if %0==6 SUMSUB_BA %1, %2, %3, %6 SUMSUB_BA %1, %4, %5, %6 %else padd%1 m%2, m%3 padd%1 m%4, m%5 padd%1 m%3, m%3 padd%1 m%5, m%5 psub%1 m%3, m%2 psub%1 m%5, m%4 %endif %endmacro %macro HADAMARD4_V 4+ SUMSUB_BADC w, %1, %2, %3, %4 SUMSUB_BADC w, %1, %3, %2, %4 %endmacro %macro HADAMARD8_V 8+ SUMSUB_BADC w, %1, %2, %3, %4 SUMSUB_BADC w, %5, %6, %7, %8 SUMSUB_BADC w, %1, %3, %2, %4 SUMSUB_BADC w, %5, %7, %6, %8 SUMSUB_BADC w, %1, %5, %2, %6 SUMSUB_BADC w, %3, %7, %4, %8 %endmacro %macro TRANS_SSE2 5-6 ; TRANSPOSE2x2 ; %1: transpose width (d/q) - use SBUTTERFLY qdq for dq ; %2: ord/unord (for compat with sse4, unused) ; %3/%4: source regs ; %5/%6: tmp regs %ifidn %1, d %define mask [mask_10] %define shift 16 %elifidn %1, q %define mask [mask_1100] %define shift 32 %endif %if %0==6 ; less dependency if we have two tmp mova m%5, mask ; ff00 mova m%6, m%4 ; x5x4 psll%1 m%4, shift ; x4.. pand m%6, m%5 ; x5.. pandn m%5, m%3 ; ..x0 psrl%1 m%3, shift ; ..x1 por m%4, m%5 ; x4x0 por m%3, m%6 ; x5x1 %else ; more dependency, one insn less. sometimes faster, sometimes not mova m%5, m%4 ; x5x4 psll%1 m%4, shift ; x4.. pxor m%4, m%3 ; (x4^x1)x0 pand m%4, mask ; (x4^x1).. 
pxor m%3, m%4 ; x4x0 psrl%1 m%4, shift ; ..(x1^x4) pxor m%5, m%4 ; x5x1 SWAP %4, %3, %5 %endif %endmacro %macro TRANS_SSE4 5-6 ; see above %ifidn %1, d %ifidn %2, ord psrl%1 m%5, m%3, 16 pblendw m%5, m%4, q2222 psll%1 m%4, 16 pblendw m%4, m%3, q1111 SWAP %3, %5 %else %if avx_enabled pblendw m%5, m%3, m%4, q2222 SWAP %3, %5 %else mova m%5, m%3 pblendw m%3, m%4, q2222 %endif psll%1 m%4, 16 psrl%1 m%5, 16 por m%4, m%5 %endif %elifidn %1, q shufps m%5, m%3, m%4, q3131 shufps m%3, m%3, m%4, q2020 SWAP %4, %5 %endif %endmacro %macro TRANS_XOP 5-6 %ifidn %1, d vpperm m%5, m%3, m%4, [transd_shuf1] vpperm m%3, m%3, m%4, [transd_shuf2] %elifidn %1, q shufps m%5, m%3, m%4, q3131 shufps m%3, m%4, q2020 %endif SWAP %4, %5 %endmacro %macro HADAMARD 5-6 ; %1=distance in words (0 for vertical pass, 1/2/4 for horizontal passes) ; %2=sumsub/max/amax (sum and diff / maximum / maximum of absolutes) ; %3/%4: regs ; %5(%6): tmpregs %if %1!=0 ; have to reorder stuff for horizontal op %ifidn %2, sumsub %define ORDER ord ; sumsub needs order because a-b != b-a unless a=b %else %define ORDER unord ; if we just max, order doesn't matter (allows pblendw+or in sse4) %endif %if %1==1 TRANS d, ORDER, %3, %4, %5, %6 %elif %1==2 %if mmsize==8 SBUTTERFLY dq, %3, %4, %5 %else TRANS q, ORDER, %3, %4, %5, %6 %endif %elif %1==4 SBUTTERFLY qdq, %3, %4, %5 %elif %1==8 SBUTTERFLY dqqq, %3, %4, %5 %endif %endif %ifidn %2, sumsub SUMSUB_BA w, %3, %4, %5 %else %ifidn %2, amax %if %0==6 ABSW2 m%3, m%4, m%3, m%4, m%5, m%6 %else ABSW m%3, m%3, m%5 ABSW m%4, m%4, m%5 %endif %endif pmaxsw m%3, m%4 %endif %endmacro %macro HADAMARD2_2D 6-7 sumsub HADAMARD 0, sumsub, %1, %2, %5 HADAMARD 0, sumsub, %3, %4, %5 SBUTTERFLY %6, %1, %2, %5 %ifnum %7 HADAMARD 0, amax, %1, %2, %5, %7 %else HADAMARD 0, %7, %1, %2, %5 %endif SBUTTERFLY %6, %3, %4, %5 %ifnum %7 HADAMARD 0, amax, %3, %4, %5, %7 %else HADAMARD 0, %7, %3, %4, %5 %endif %endmacro %macro HADAMARD4_2D 5-6 sumsub HADAMARD2_2D %1, %2, %3, %4, %5, wd HADAMARD2_2D %1, %3, %2, %4, %5, dq, %6 SWAP %2, %3 %endmacro %macro HADAMARD4_2D_SSE 5-6 sumsub HADAMARD 0, sumsub, %1, %2, %5 ; 1st V row 0 + 1 HADAMARD 0, sumsub, %3, %4, %5 ; 1st V row 2 + 3 SBUTTERFLY wd, %1, %2, %5 ; %1: m0 1+0 %2: m1 1+0 SBUTTERFLY wd, %3, %4, %5 ; %3: m0 3+2 %4: m1 3+2 HADAMARD2_2D %1, %3, %2, %4, %5, dq SBUTTERFLY qdq, %1, %2, %5 HADAMARD 0, %6, %1, %2, %5 ; 2nd H m1/m0 row 0+1 SBUTTERFLY qdq, %3, %4, %5 HADAMARD 0, %6, %3, %4, %5 ; 2nd H m1/m0 row 2+3 %endmacro %macro HADAMARD8_2D 9-10 sumsub HADAMARD2_2D %1, %2, %3, %4, %9, wd HADAMARD2_2D %5, %6, %7, %8, %9, wd HADAMARD2_2D %1, %3, %2, %4, %9, dq HADAMARD2_2D %5, %7, %6, %8, %9, dq HADAMARD2_2D %1, %5, %3, %7, %9, qdq, %10 HADAMARD2_2D %2, %6, %4, %8, %9, qdq, %10 %ifnidn %10, amax SWAP %2, %5 SWAP %4, %7 %endif %endmacro ; doesn't include the "pmaddubsw hmul_8p" pass %macro HADAMARD8_2D_HMUL 10 HADAMARD4_V %1, %2, %3, %4, %9 HADAMARD4_V %5, %6, %7, %8, %9 SUMSUB_BADC w, %1, %5, %2, %6, %9 HADAMARD 2, sumsub, %1, %5, %9, %10 HADAMARD 2, sumsub, %2, %6, %9, %10 SUMSUB_BADC w, %3, %7, %4, %8, %9 HADAMARD 2, sumsub, %3, %7, %9, %10 HADAMARD 2, sumsub, %4, %8, %9, %10 HADAMARD 1, amax, %1, %5, %9, %10 HADAMARD 1, amax, %2, %6, %9, %5 HADAMARD 1, amax, %3, %7, %9, %5 HADAMARD 1, amax, %4, %8, %9, %5 %endmacro %macro SUMSUB2_AB 4 %if cpuflag(xop) pmacs%1%1 m%4, m%3, [p%1_m2], m%2 pmacs%1%1 m%2, m%2, [p%1_2], m%3 %elifnum %3 psub%1 m%4, m%2, m%3 psub%1 m%4, m%3 padd%1 m%2, m%2 padd%1 m%2, m%3 %else mova m%4, m%2 padd%1 m%2, m%2 padd%1 m%2, %3 psub%1 m%4, %3 psub%1 m%4, %3 
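    ; (note added for clarity: this memory-operand fallback computes
    ;  %2 = 2*%2 + %3 and %4 = (old %2) - 2*%3, matching the register
    ;  and XOP paths above)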
%endif %endmacro %macro SUMSUBD2_AB 5 %ifnum %4 psra%1 m%5, m%2, 1 ; %3: %3>>1 psra%1 m%4, m%3, 1 ; %2: %2>>1 padd%1 m%4, m%2 ; %3: %3>>1+%2 psub%1 m%5, m%3 ; %2: %2>>1-%3 SWAP %2, %5 SWAP %3, %4 %else mova %5, m%2 mova %4, m%3 psra%1 m%3, 1 ; %3: %3>>1 psra%1 m%2, 1 ; %2: %2>>1 padd%1 m%3, %5 ; %3: %3>>1+%2 psub%1 m%2, %4 ; %2: %2>>1-%3 %endif %endmacro %macro DCT4_1D 5 %ifnum %5 SUMSUB_BADC w, %4, %1, %3, %2, %5 SUMSUB_BA w, %3, %4, %5 SUMSUB2_AB w, %1, %2, %5 SWAP %1, %3, %4, %5, %2 %else SUMSUB_BADC w, %4, %1, %3, %2 SUMSUB_BA w, %3, %4 mova [%5], m%2 SUMSUB2_AB w, %1, [%5], %2 SWAP %1, %3, %4, %2 %endif %endmacro %macro IDCT4_1D 6-7 %ifnum %6 SUMSUBD2_AB %1, %3, %5, %7, %6 ; %3: %3>>1-%5 %5: %3+%5>>1 SUMSUB_BA %1, %4, %2, %7 ; %4: %2+%4 %2: %2-%4 SUMSUB_BADC %1, %5, %4, %3, %2, %7 ; %5: %2+%4 + (%3+%5>>1) ; %4: %2+%4 - (%3+%5>>1) ; %3: %2-%4 + (%3>>1-%5) ; %2: %2-%4 - (%3>>1-%5) %else %ifidn %1, w SUMSUBD2_AB %1, %3, %5, [%6], [%6+16] %else SUMSUBD2_AB %1, %3, %5, [%6], [%6+32] %endif SUMSUB_BA %1, %4, %2 SUMSUB_BADC %1, %5, %4, %3, %2 %endif SWAP %2, %5, %4 ; %2: %2+%4 + (%3+%5>>1) row0 ; %3: %2-%4 + (%3>>1-%5) row1 ; %4: %2-%4 - (%3>>1-%5) row2 ; %5: %2+%4 - (%3+%5>>1) row3 %endmacro %macro LOAD_DIFF 5-6 1 %if HIGH_BIT_DEPTH %if %6 ; %5 aligned? mova %1, %4 psubw %1, %5 %else movu %1, %4 movu %2, %5 psubw %1, %2 %endif %else ; !HIGH_BIT_DEPTH %ifidn %3, none movh %1, %4 movh %2, %5 punpcklbw %1, %2 punpcklbw %2, %2 psubw %1, %2 %else movh %1, %4 punpcklbw %1, %3 movh %2, %5 punpcklbw %2, %3 psubw %1, %2 %endif %endif ; HIGH_BIT_DEPTH %endmacro %macro LOAD_DIFF8x4 8 ; 4x dst, 1x tmp, 1x mul, 2x ptr %if BIT_DEPTH == 8 && cpuflag(ssse3) movh m%2, [%8+%1*FDEC_STRIDE] movh m%1, [%7+%1*FENC_STRIDE] punpcklbw m%1, m%2 movh m%3, [%8+%2*FDEC_STRIDE] movh m%2, [%7+%2*FENC_STRIDE] punpcklbw m%2, m%3 movh m%4, [%8+%3*FDEC_STRIDE] movh m%3, [%7+%3*FENC_STRIDE] punpcklbw m%3, m%4 movh m%5, [%8+%4*FDEC_STRIDE] movh m%4, [%7+%4*FENC_STRIDE] punpcklbw m%4, m%5 pmaddubsw m%1, m%6 pmaddubsw m%2, m%6 pmaddubsw m%3, m%6 pmaddubsw m%4, m%6 %else LOAD_DIFF m%1, m%5, m%6, [%7+%1*FENC_STRIDEB], [%8+%1*FDEC_STRIDEB] LOAD_DIFF m%2, m%5, m%6, [%7+%2*FENC_STRIDEB], [%8+%2*FDEC_STRIDEB] LOAD_DIFF m%3, m%5, m%6, [%7+%3*FENC_STRIDEB], [%8+%3*FDEC_STRIDEB] LOAD_DIFF m%4, m%5, m%6, [%7+%4*FENC_STRIDEB], [%8+%4*FDEC_STRIDEB] %endif %endmacro %macro STORE_DCT 6 movq [%5+%6+ 0], m%1 movq [%5+%6+ 8], m%2 movq [%5+%6+16], m%3 movq [%5+%6+24], m%4 movhps [%5+%6+32], m%1 movhps [%5+%6+40], m%2 movhps [%5+%6+48], m%3 movhps [%5+%6+56], m%4 %endmacro %macro STORE_IDCT 4 movhps [r0-4*FDEC_STRIDE], %1 movh [r0-3*FDEC_STRIDE], %1 movhps [r0-2*FDEC_STRIDE], %2 movh [r0-1*FDEC_STRIDE], %2 movhps [r0+0*FDEC_STRIDE], %3 movh [r0+1*FDEC_STRIDE], %3 movhps [r0+2*FDEC_STRIDE], %4 movh [r0+3*FDEC_STRIDE], %4 %endmacro %macro LOAD_DIFF_8x4P 7-11 r0,r2,0,1 ; 4x dest, 2x temp, 2x pointer, increment, aligned? 
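; (note added for clarity: loads four rows of pixels from the enc/dec
;  pointers, subtracts them pairwise via LOAD_DIFF into the four
;  destination registers, and advances both pointers by four strides
;  when the `increment' argument is nonzero)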
LOAD_DIFF m%1, m%5, m%7, [%8], [%9], %11 LOAD_DIFF m%2, m%6, m%7, [%8+r1], [%9+r3], %11 LOAD_DIFF m%3, m%5, m%7, [%8+2*r1], [%9+2*r3], %11 LOAD_DIFF m%4, m%6, m%7, [%8+r4], [%9+r5], %11 %if %10 lea %8, [%8+4*r1] lea %9, [%9+4*r3] %endif %endmacro ; 2xdst, 2xtmp, 2xsrcrow %macro LOAD_DIFF16x2_AVX2 6 pmovzxbw m%1, [r1+%5*FENC_STRIDE] pmovzxbw m%2, [r1+%6*FENC_STRIDE] pmovzxbw m%3, [r2+(%5-4)*FDEC_STRIDE] pmovzxbw m%4, [r2+(%6-4)*FDEC_STRIDE] psubw m%1, m%3 psubw m%2, m%4 %endmacro %macro DIFFx2 6-7 movh %3, %5 punpcklbw %3, %4 psraw %1, 6 paddsw %1, %3 movh %3, %6 punpcklbw %3, %4 psraw %2, 6 paddsw %2, %3 packuswb %2, %1 %endmacro ; (high depth) in: %1, %2, min to clip, max to clip, mem128 ; in: %1, tmp, %3, mem64 %macro STORE_DIFF 4-5 %if HIGH_BIT_DEPTH psrad %1, 6 psrad %2, 6 packssdw %1, %2 paddw %1, %5 CLIPW %1, %3, %4 mova %5, %1 %else movh %2, %4 punpcklbw %2, %3 psraw %1, 6 paddsw %1, %2 packuswb %1, %1 movh %4, %1 %endif %endmacro %macro SHUFFLE_MASK_W 8 %rep 8 %if %1>=0x80 db %1, %1 %else db %1*2 db %1*2+1 %endif %rotate 1 %endrep %endmacro ; instruction, accum, input, iteration (zero to swap, nonzero to add) %macro ACCUM 4 %if %4 %1 m%2, m%3 %else SWAP %2, %3 %endif %endmacro ; IACA support %macro IACA_START 0 mov ebx, 111 db 0x64, 0x67, 0x90 %endmacro %macro IACA_END 0 mov ebx, 222 db 0x64, 0x67, 0x90 %endmacro davs2-1.6/source/configw.h000066400000000000000000000055141337322544400155200ustar00rootroot00000000000000/* * configw.h * * Description of this file: * header file for MS/Intel compiler on windows platform of the davs2 library * * -------------------------------------------------------------------------- * * davs2 - video decoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. 
*/ #ifndef DAVS2_CONFIGW_H #define DAVS2_CONFIGW_H #if defined(__ICL) || defined(_MSC_VER) /* arch */ #define ARCH_X86 1 #define ARCH_PPC 0 #define ARCH_ARM 0 #define ARCH_UltraSPARC 0 /* system */ #define SYS_WINDOWS 1 #define SYS_LINUX 0 #define SYS_MACOSX 0 #define SYS_BEOS 0 #define SYS_FREEBSD 0 #define SYS_OPENBSD 0 /* cpu */ #ifndef __SSE__ #define __SSE__ #endif #define HAVE_MMX 1 /* X86 */ #define HAVE_ALTIVEC 0 /* ALTIVEC */ #define HAVE_ALTIVEC_H 0 #define HAVE_NEON 0 /* ARM */ #define HAVE_ARMV6 0 #define HAVE_ARMV6T2 0 /* thread */ #define HAVE_THREAD 1 #define HAVE_WIN32THREAD 1 #define HAVE_PTHREAD 0 #define HAVE_BEOSTHREAD 0 #define HAVE_POSIXTHREAD 0 #define PTW32_STATIC_LIB 0 /* interlace support */ #define HAVE_INTERLACED 1 /* malloc */ #define HAVE_MALLOC_H 0 /* big-endian */ #define WORDS_BIGENDIAN 0 /* others */ #define HAVE_STDINT_H 1 #define HAVE_VECTOREXT 0 #define HAVE_LOG2F 0 #define HAVE_SWSCALE 0 #define HAVE_LAVF 0 #define HAVE_FFMS 0 #define HAVE_GPAC 0 #define HAVE_GF_MALLOC 0 #define HAVE_AVS 0 #endif #endif // DAVS2_CONFIGW_H davs2-1.6/source/davs2.h000066400000000000000000000256431337322544400151100ustar00rootroot00000000000000/* * davs2.h * * Description of this file: * API functions definition of the davs2 library * * -------------------------------------------------------------------------- * * davs2 - video decoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. 
*/

#ifndef DAVS2_DAVS2_H
#define DAVS2_DAVS2_H

#include <stdint.h>

#ifdef __cplusplus
extern "C" {    // only need to export C interface if used by C++ source code
#endif

/* dAVS2 build version, means different API interface
 * (10 * VER_MAJOR + VER_MINOR) */
#define DAVS2_BUILD 16

/**
 * ===========================================================================
 * define DAVS2_API
 * ===========================================================================
 */
#ifdef DAVS2_EXPORTS
#  ifdef __GNUC__                  /* for Linux */
#    if __GNUC__ >= 4
#      define DAVS2_API __attribute__((visibility("default")))
#    else
#      define DAVS2_API __attribute__((dllexport))
#    endif
#  else                            /* for windows */
#    define DAVS2_API __declspec(dllexport)
#  endif
#else
#  ifdef __GNUC__                  /* for Linux */
#    define DAVS2_API
#  else                            /* for windows */
#    define DAVS2_API __declspec(dllimport)
#  endif
#endif

/**
 * ===========================================================================
 * const defines
 * ===========================================================================
 */

/* ---------------------------------------------------------------------------
 * picture type */
enum davs2_picture_type_e {
    DAVS2_PIC_I = 0,    /* picture-I */
    DAVS2_PIC_P = 1,    /* picture-P */
    DAVS2_PIC_B = 2,    /* picture-B */
    DAVS2_PIC_G = 3,    /* picture-G */
    DAVS2_PIC_F = 4,    /* picture-F */
    DAVS2_PIC_S = 5     /* picture-S */
};

/* ---------------------------------------------------------------------------
 * profile id */
enum davs2_profile_id_e {
    DAVS2_PROFILE_MAIN_PIC = 0x12,  /* AVS2 main picture profile */
    DAVS2_PROFILE_MAIN     = 0x20,  /* AVS2 main profile */
    DAVS2_PROFILE_MAIN10   = 0x22   /* AVS2 main 10bit profile */
};

/* ---------------------------------------------------------------------------
 * log level */
enum davs2_log_level_e {
    DAVS2_LOG_DEBUG   = 0,
    DAVS2_LOG_INFO    = 1,
    DAVS2_LOG_WARNING = 2,
    DAVS2_LOG_ERROR   = 3,
    DAVS2_LOG_MAX     = 4
};

/* ---------------------------------------------------------------------------
 * information of return value for decode/flush() */
enum davs2_ret_e {
    DAVS2_ERROR      = -1,  /* Decoding error occurs */
    DAVS2_DEFAULT    = 0,   /* Decoding but no output */
    DAVS2_GOT_FRAME  = 1,   /* Decoding got one frame */
    DAVS2_GOT_HEADER = 2,   /* Decoding got the sequence header, always obtained before DAVS2_GOT_FRAME */
    DAVS2_END        = 3    /* Decoding ended: no more bit-stream to decode and no more frames to output */
};

/**
 * ===========================================================================
 * interface struct type defines
 * ===========================================================================
 */

/* ---------------------------------------------------------------------------
 * information of sequence header */
typedef struct davs2_seq_info_t {
    uint32_t profile_id;          /* profile ID, davs2_profile_id_e */
    uint32_t level_id;            /* level ID */
    uint32_t progressive;         /* progressive sequence (0: interlace, 1: progressive) */
    uint32_t width;               /* image width */
    uint32_t height;              /* image height */
    uint32_t chroma_format;       /* chroma format(1: 4:2:0, 2: 4:2:2) */
    uint32_t aspect_ratio;        /* 2: 4:3, 3: 16:9 */
    uint32_t low_delay;           /* low delay */
    uint32_t bitrate;             /* bitrate (bps) */
    uint32_t internal_bit_depth;  /* internal sample bit depth */
    uint32_t output_bit_depth;    /* output sample bit depth */
    uint32_t bytes_per_sample;    /* bytes per sample */
    float    frame_rate;          /* frame rate */
    uint32_t frame_rate_id;       /* frame rate code, mpeg12 [1...8] */
} davs2_seq_info_t;

/* ---------------------------------------------------------------------------
 * packet of bitstream */
typedef struct davs2_packet_t {
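    /* Illustrative sketch (not part of the original header; variable names
     * are hypothetical): a caller wraps each chunk of raw AVS2 bitstream in
     * one packet, sends it to the decoder, then drains any ready frames:
     *
     *     davs2_packet_t pkt = { buf, n_bytes, pts, dts };
     *     if (davs2_decoder_send_packet(dec, &pkt) != DAVS2_ERROR) {
     *         while (davs2_decoder_recv_frame(dec, &hdr, &pic) == DAVS2_GOT_FRAME) {
     *             consume(&pic);                         // use the picture
     *             davs2_decoder_frame_unref(dec, &pic);  // then recycle it
     *         }
     *     }
     */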
    const uint8_t *data;    /* bitstream */
    int            len;     /* bytes of the bitstream */
    int64_t        pts;     /* presentation time stamp */
    int64_t        dts;     /* decoding time stamp */
} davs2_packet_t;

/* ---------------------------------------------------------------------------
 * decoded picture */
typedef struct davs2_picture_t {
    void    *magic;             /* must be the 1st member variable (do not change it) */
    /* picture information */
    uint8_t *planes[3];         /* picture planes */
    int      widths[3];         /* picture width in pixels */
    int      lines[3];          /* picture height in pixels */
    int      strides[3];        /* number of bytes in one line are stored continuously in memory */
    int      pic_order_count;   /* picture number */
    int      type;              /* picture type of the corresponding frame */
    int      qp;                /* QP of the corresponding picture */
    int64_t  pts;               /* presentation time stamp */
    int64_t  dts;               /* decoding time stamp */
    int      num_planes;        /* number of planes */
    int      bytes_per_sample;  /* number of bytes for each sample */
    int      bit_depth;         /* number of bits for each sample */
    int      b_decode_error;    /* is there any decoding error of this frame? */
    void    *dec_frame;         /* pointer to decoding frame in DPB (do not change it) */
} davs2_picture_t;

/* ---------------------------------------------------------------------------
 * parameters for creating an AVS2 decoder */
typedef struct davs2_param_t {
    int   threads;      /* decoding threads: 0 for auto */
    int   info_level;   /* only output information which is no less than this level (davs2_log_level_e).
                         * 0: All; 1: no debug info; 2: only warnings and errors; 3: only errors */
    void *opaque;       /* user data */
    /* additional parameters for version >= 16 */
    int   disable_avx;  /* 1: disable; 0: default (autodetect) */
} davs2_param_t;

/**
 * ===========================================================================
 * interface function declares (DAVS2 library APIs for AVS2 video decoder)
 * ===========================================================================
 */

/**
 * ---------------------------------------------------------------------------
 * Function   : open an AVS2 decoder
 * Parameters :
 *   [in/out] : param - pointer to struct davs2_param_t
 * Return     : handle of the decoder, zero for failure
 * ---------------------------------------------------------------------------
 */
DAVS2_API void *
davs2_decoder_open(davs2_param_t *param);

/**
 * ---------------------------------------------------------------------------
 * Function   : send one packet of bitstream into the decoder
 * Parameters :
 *   [in]     : decoder - pointer to the AVS2 decoder handler
 *   [in]     : packet  - pointer to struct davs2_packet_t
 * Return     : see definition of davs2_ret_e
 * ---------------------------------------------------------------------------
 */
DAVS2_API int
davs2_decoder_send_packet(void *decoder, davs2_packet_t *packet);

/**
 * ---------------------------------------------------------------------------
 * Function   : receive one decoded frame from the decoder
 * Parameters :
 *   [in]     : decoder   - pointer to the AVS2 decoder handler
 *   [out]    : headerset - pointer to output common frame information (would always appear before frame output)
 *   [out]    : out_frame - pointer to output frame information
 * Return     : see definition of davs2_ret_e
 * ---------------------------------------------------------------------------
 */
DAVS2_API int
davs2_decoder_recv_frame(void *decoder, davs2_seq_info_t *headerset, davs2_picture_t *out_frame);

/**
 * ---------------------------------------------------------------------------
 * Function   : flush the decoder
 * Parameters :
 *   [in]     : decoder   - decoder handle
 *   [out]    : headerset - pointer to output common frame information (would always
appear before frame output) * [out] : out_frame - pointer to output frame information * Return : see definition of davs2_ret_e * --------------------------------------------------------------------------- */ DAVS2_API int davs2_decoder_flush(void *decoder, davs2_seq_info_t *headerset, davs2_picture_t *out_frame); /** * --------------------------------------------------------------------------- * Function : release one output frame * Parameters : * [in] : decoder - decoder handle * : out_frame - frame to recycle * Return : none * --------------------------------------------------------------------------- */ DAVS2_API void davs2_decoder_frame_unref(void *decoder, davs2_picture_t *out_frame); /** * --------------------------------------------------------------------------- * Function : close the AVS2 decoder * Parameters : * [in] : decoder - decoder handle * Return : none * --------------------------------------------------------------------------- */ DAVS2_API void davs2_decoder_close(void *decoder); #ifdef __cplusplus } #endif #endif // DAVS2_DAVS2_H davs2-1.6/source/test/000077500000000000000000000000001337322544400146655ustar00rootroot00000000000000davs2-1.6/source/test/getopt/000077500000000000000000000000001337322544400161675ustar00rootroot00000000000000davs2-1.6/source/test/getopt/getopt.c000066400000000000000000001071521337322544400176430ustar00rootroot00000000000000/* Getopt for GNU. NOTE: getopt is now part of the C library, so if you don't know what "Keep this file name-space clean" means, talk to drepper@gnu.org before changing it! Copyright (C) 1987,88,89,90,91,92,93,94,95,96,98,99,2000,2001 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU C Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with the GNU C Library; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. */ /* This tells Alpha OSF/1 not to define a getopt prototype in . Ditto for AIX 3.2 and . */ #ifndef _NO_PROTO # define _NO_PROTO #endif #ifdef HAVE_CONFIG_H # include #endif #if !defined __STDC__ || !__STDC__ /* This is a separate conditional since some stdc systems reject `defined (const)'. */ # ifndef const # define const # endif #endif #include /* Comment out all this code if we are using the GNU C Library, and are not actually compiling the library itself. This code is part of the GNU C Library, but also included in many other GNU distributions. Compiling and linking in this code is a waste when using the GNU C library (especially if it is a shared library). Rather than having every GNU program understand `configure --with-gnu-libc' and omit the object files, it is simpler to just do this in the source for each such file. */ #define GETOPT_INTERFACE_VERSION 2 #if !defined _LIBC && defined __GLIBC__ && __GLIBC__ >= 2 # include # if _GNU_GETOPT_INTERFACE_VERSION == GETOPT_INTERFACE_VERSION # define ELIDE_CODE # endif #endif #ifndef ELIDE_CODE /* This needs to come after some library #include to get __GNU_LIBRARY__ defined. 
*/ #ifdef __GNU_LIBRARY__ /* Don't include stdlib.h for non-GNU C libraries because some of them contain conflicting prototypes for getopt. */ # include # include #endif /* GNU C library. */ #ifdef VMS # include # if HAVE_STRING_H - 0 # include # endif #endif #ifndef _ /* This is for other GNU distributions with internationalized messages. */ # if defined HAVE_LIBINTL_H || defined _LIBC # include # ifndef _ # define _(msgid) gettext (msgid) # endif # else # define _(msgid) (msgid) # endif #endif /* This version of `getopt' appears to the caller like standard Unix `getopt' but it behaves differently for the user, since it allows the user to intersperse the options with the other arguments. As `getopt' works, it permutes the elements of ARGV so that, when it is done, all the options precede everything else. Thus all application programs are extended to handle flexible argument order. Setting the environment variable POSIXLY_CORRECT disables permutation. Then the behavior is completely standard. GNU application programs can use a third alternative mode in which they can distinguish the relative order of options and other arguments. */ #include "getopt.h" /* For communication from `getopt' to the caller. When `getopt' finds an option that takes an argument, the argument value is returned here. Also, when `ordering' is RETURN_IN_ORDER, each non-option ARGV-element is returned here. */ char *optarg; /* Index in ARGV of the next element to be scanned. This is used for communication to and from the caller and for communication between successive calls to `getopt'. On entry to `getopt', zero means this is the first call; initialize. When `getopt' returns -1, this is the index of the first of the non-option elements that the caller should itself scan. Otherwise, `optind' communicates from one call to the next how much of ARGV has been scanned so far. */ /* 1003.2 says this must be 1 before any call. */ int optind = 1; /* Formerly, initialization of getopt depended on optind==0, which causes problems with re-calling getopt as programs generally don't know that. */ int __getopt_initialized; /* The next char to be scanned in the option-element in which the last option character we returned was found. This allows us to pick up the scan where we left off. If this is zero, or a null string, it means resume the scan by advancing to the next ARGV-element. */ static char *nextchar; /* Callers store zero here to inhibit the error message for unrecognized options. */ int opterr = 1; /* Set to an option character which was unrecognized. This must be initialized on some systems to avoid linking in the system's own getopt implementation. */ int optopt = '?'; /* Describe how to deal with options that follow non-option ARGV-elements. If the caller did not specify anything, the default is REQUIRE_ORDER if the environment variable POSIXLY_CORRECT is defined, PERMUTE otherwise. REQUIRE_ORDER means don't recognize them as options; stop option processing when the first non-option is seen. This is what Unix does. This mode of operation is selected by either setting the environment variable POSIXLY_CORRECT, or using `+' as the first character of the list of option characters. PERMUTE is the default. We permute the contents of ARGV as we scan, so that eventually all the non-options are at the end. This allows options to be given in any order, even with programs that were not written to expect this. 
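(Illustration added for clarity: under PERMUTE, a command line such as `prog file1 -v file2' behaves as if it had been written `prog -v file1 file2'; after scanning, the two file names have been moved behind the option and `optind' indexes the first of them.)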
RETURN_IN_ORDER is an option available to programs that were written to expect options and other ARGV-elements in any order and that care about the ordering of the two. We describe each non-option ARGV-element as if it were the argument of an option with character code 1. Using `-' as the first character of the list of option characters selects this mode of operation. The special argument `--' forces an end of option-scanning regardless of the value of `ordering'. In the case of RETURN_IN_ORDER, only `--' can cause `getopt' to return -1 with `optind' != ARGC. */ static enum { REQUIRE_ORDER, PERMUTE, RETURN_IN_ORDER } ordering; /* Value of POSIXLY_CORRECT environment variable. */ static char *posixly_correct; #ifdef __GNU_LIBRARY__ /* We want to avoid inclusion of string.h with non-GNU libraries because there are many ways it can cause trouble. On some systems, it contains special magic macros that don't work in GCC. */ # include # define my_index strchr #else #include /* Avoid depending on library functions or files whose names are inconsistent. */ #ifndef getenv extern char *getenv(); #endif static char * my_index(str, chr) const char *str; int chr; { while (*str) { if (*str == chr) { return (char *) str; } str++; } return 0; } /* If using GCC, we can safely declare strlen this way. If not using GCC, it is ok not to declare it. */ #ifdef __GNUC__ /* Note that Motorola Delta 68k R3V7 comes with GCC but not stddef.h. That was relevant to code that was here before. */ # if (!defined __STDC__ || !__STDC__) && !defined strlen /* gcc with -traditional declares the built-in strlen to return int, and has done so at least since version 2.4.5. -- rms. */ extern int strlen(const char *); # endif /* not __STDC__ */ #endif /* __GNUC__ */ #endif /* not __GNU_LIBRARY__ */ /* Handle permutation of arguments. */ /* Describe the part of ARGV that contains non-options that have been skipped. `first_nonopt' is the index in ARGV of the first of them; `last_nonopt' is the index after the last of them. */ static int first_nonopt; static int last_nonopt; #ifdef _LIBC /* Stored original parameters. XXX This is no good solution. We should rather copy the args so that we can compare them later. But we must not use malloc(3). */ extern int __libc_argc; extern char **__libc_argv; /* Bash 2.0 gives us an environment variable containing flags indicating ARGV elements that should not be considered arguments. */ # ifdef USE_NONOPTION_FLAGS /* Defined in getopt_init.c */ extern char *__getopt_nonoption_flags; static int nonoption_flags_max_len; static int nonoption_flags_len; # endif # ifdef USE_NONOPTION_FLAGS # define SWAP_FLAGS(ch1, ch2) \ if (nonoption_flags_len > 0) \ { \ char __tmp = __getopt_nonoption_flags[ch1]; \ __getopt_nonoption_flags[ch1] = __getopt_nonoption_flags[ch2]; \ __getopt_nonoption_flags[ch2] = __tmp; \ } # else # define SWAP_FLAGS(ch1, ch2) # endif #else /* !_LIBC */ # define SWAP_FLAGS(ch1, ch2) #endif /* _LIBC */ /* Exchange two adjacent subsequences of ARGV. One subsequence is elements [first_nonopt,last_nonopt) which contains all the non-options that have been skipped so far. The other is elements [last_nonopt,optind), which contains all the options processed since those non-options were skipped. `first_nonopt' and `last_nonopt' are relocated so that they describe the new indices of the non-options in ARGV after they are moved. 
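Worked example (added for clarity): with ARGV = {prog, a, b, -x, -y}, first_nonopt = 1, last_nonopt = 3 and optind = 5, the swap yields {prog, -x, -y, a, b}, after which first_nonopt = 3 and last_nonopt = 5.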
*/ #if defined __STDC__ && __STDC__ static void exchange(char **); #endif static void exchange(argv) char **argv; { int bottom = first_nonopt; int middle = last_nonopt; int top = optind; char *tem; /* Exchange the shorter segment with the far end of the longer segment. That puts the shorter segment into the right place. It leaves the longer segment in the right place overall, but it consists of two parts that need to be swapped next. */ #if defined _LIBC && defined USE_NONOPTION_FLAGS /* First make sure the handling of the `__getopt_nonoption_flags' string can work normally. Our top argument must be in the range of the string. */ if (nonoption_flags_len > 0 && top >= nonoption_flags_max_len) { /* We must extend the array. The user plays games with us and presents new arguments. */ char *new_str = malloc(top + 1); if (new_str == NULL) { nonoption_flags_len = nonoption_flags_max_len = 0; } else { memset(__mempcpy(new_str, __getopt_nonoption_flags, nonoption_flags_max_len), '\0', top + 1 - nonoption_flags_max_len); nonoption_flags_max_len = top + 1; __getopt_nonoption_flags = new_str; } } #endif while (top > middle && middle > bottom) { if (top - middle > middle - bottom) { /* Bottom segment is the short one. */ int len = middle - bottom; register int i; /* Swap it with the top part of the top segment. */ for (i = 0; i < len; i++) { tem = argv[bottom + i]; argv[bottom + i] = argv[top - (middle - bottom) + i]; argv[top - (middle - bottom) + i] = tem; SWAP_FLAGS(bottom + i, top - (middle - bottom) + i); } /* Exclude the moved bottom segment from further swapping. */ top -= len; } else { /* Top segment is the short one. */ int len = top - middle; register int i; /* Swap it with the bottom part of the bottom segment. */ for (i = 0; i < len; i++) { tem = argv[bottom + i]; argv[bottom + i] = argv[middle + i]; argv[middle + i] = tem; SWAP_FLAGS(bottom + i, middle + i); } /* Exclude the moved top segment from further swapping. */ bottom += len; } } /* Update records for the slots the non-options now occupy. */ first_nonopt += (optind - last_nonopt); last_nonopt = optind; } /* Initialize the internal data when the first call is made. */ #if defined __STDC__ && __STDC__ static const char *_getopt_initialize(int, char *const *, const char *); #endif static const char * _getopt_initialize(argc, argv, optstring) int argc; char *const *argv; const char *optstring; { /* Start processing options with ARGV-element 1 (since ARGV-element 0 is the program name); the sequence of previously skipped non-option ARGV-elements is empty. */ first_nonopt = last_nonopt = optind; nextchar = NULL; posixly_correct = getenv("POSIXLY_CORRECT"); /* Determine how to handle the ordering of options and nonoptions. 
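(Summary added for clarity, restating the code below: a leading `-' in OPTSTRING selects RETURN_IN_ORDER, a leading `+' selects REQUIRE_ORDER, a set POSIXLY_CORRECT environment variable also forces REQUIRE_ORDER, and PERMUTE is the default.)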
*/ if (optstring[0] == '-') { ordering = RETURN_IN_ORDER; ++optstring; } else if (optstring[0] == '+') { ordering = REQUIRE_ORDER; ++optstring; } else if (posixly_correct != NULL) { ordering = REQUIRE_ORDER; } else { ordering = PERMUTE; } #if defined _LIBC && defined USE_NONOPTION_FLAGS if (posixly_correct == NULL && argc == __libc_argc && argv == __libc_argv) { if (nonoption_flags_max_len == 0) { if (__getopt_nonoption_flags == NULL || __getopt_nonoption_flags[0] == '\0') { nonoption_flags_max_len = -1; } else { const char *orig_str = __getopt_nonoption_flags; int len = nonoption_flags_max_len = strlen(orig_str); if (nonoption_flags_max_len < argc) { nonoption_flags_max_len = argc; } __getopt_nonoption_flags = (char *) malloc(nonoption_flags_max_len); if (__getopt_nonoption_flags == NULL) { nonoption_flags_max_len = -1; } else memset(__mempcpy(__getopt_nonoption_flags, orig_str, len), '\0', nonoption_flags_max_len - len); } } nonoption_flags_len = nonoption_flags_max_len; } else { nonoption_flags_len = 0; } #endif return optstring; } /* Scan elements of ARGV (whose length is ARGC) for option characters given in OPTSTRING. If an element of ARGV starts with '-', and is not exactly "-" or "--", then it is an option element. The characters of this element (aside from the initial '-') are option characters. If `getopt' is called repeatedly, it returns successively each of the option characters from each of the option elements. If `getopt' finds another option character, it returns that character, updating `optind' and `nextchar' so that the next call to `getopt' can resume the scan with the following option character or ARGV-element. If there are no more option characters, `getopt' returns -1. Then `optind' is the index in ARGV of the first ARGV-element that is not an option. (The ARGV-elements have been permuted so that those that are not options now come last.) OPTSTRING is a string containing the legitimate option characters. If an option character is seen that is not listed in OPTSTRING, return '?' after printing an error message. If you set `opterr' to zero, the error message is suppressed but we still return '?'. If a char in OPTSTRING is followed by a colon, that means it wants an arg, so the following text in the same ARGV-element, or the text of the following ARGV-element, is returned in `optarg'. Two colons mean an option that wants an optional arg; if there is text in the current ARGV-element, it is returned in `optarg', otherwise `optarg' is set to zero. If OPTSTRING starts with `-' or `+', it requests different methods of handling the non-option ARGV-elements. See the comments about RETURN_IN_ORDER and REQUIRE_ORDER, above. Long-named options begin with `--' instead of `-'. Their names may be abbreviated as long as the abbreviation is unique or is an exact match for some defined option. If they have an argument, it follows the option name in the same ARGV-element, separated from the option name by a `=', or else the in next ARGV-element. When `getopt' finds a long-named option, it returns 0 if that option's `flag' field is nonzero, the value of the option's `val' field if the `flag' field is zero. The elements of ARGV aren't really const, because we permute them. But we pretend they're const in the prototype to be compatible with other systems. LONGOPTS is a vector of `struct option' terminated by an element containing a name which is zero. LONGIND returns the index in LONGOPT of the long-named option found. 
It is only valid when a long-named option has been found by the most recent call. If LONG_ONLY is nonzero, '-' as well as '--' can introduce long-named options. */ int _getopt_internal(argc, argv, optstring, longopts, longind, long_only) int argc; char *const *argv; const char *optstring; const struct option *longopts; int32_t *longind; int long_only; { int print_errors = opterr; if (optstring[0] == ':') { print_errors = 0; } if (argc < 1) { return -1; } optarg = NULL; if (optind == 0 || !__getopt_initialized) { if (optind == 0) { optind = 1; /* Don't scan ARGV[0], the program name. */ } optstring = _getopt_initialize(argc, argv, optstring); __getopt_initialized = 1; } /* Test whether ARGV[optind] points to a non-option argument. Either it does not have option syntax, or there is an environment flag from the shell indicating it is not an option. The later information is only used when the used in the GNU libc. */ #if defined _LIBC && defined USE_NONOPTION_FLAGS # define NONOPTION_P (argv[optind][0] != '-' || argv[optind][1] == '\0' \ || (optind < nonoption_flags_len \ && __getopt_nonoption_flags[optind] == '1')) #else # define NONOPTION_P (argv[optind][0] != '-' || argv[optind][1] == '\0') #endif if (nextchar == NULL || *nextchar == '\0') { /* Advance to the next ARGV-element. */ /* Give FIRST_NONOPT & LAST_NONOPT rational values if OPTIND has been moved back by the user (who may also have changed the arguments). */ if (last_nonopt > optind) { last_nonopt = optind; } if (first_nonopt > optind) { first_nonopt = optind; } if (ordering == PERMUTE) { /* If we have just processed some options following some non-options, exchange them so that the options come first. */ if (first_nonopt != last_nonopt && last_nonopt != optind) { exchange((char **) argv); } else if (last_nonopt != optind) { first_nonopt = optind; } /* Skip any additional non-options and extend the range of non-options previously skipped. */ while (optind < argc && NONOPTION_P) { optind++; } last_nonopt = optind; } /* The special ARGV-element `--' means premature end of options. Skip it like a null option, then exchange with previous non-options as if it were an option, then skip everything else like a non-option. */ if (optind != argc && !strcmp(argv[optind], "--")) { optind++; if (first_nonopt != last_nonopt && last_nonopt != optind) { exchange((char **) argv); } else if (first_nonopt == last_nonopt) { first_nonopt = optind; } last_nonopt = argc; optind = argc; } /* If we have done all the ARGV-elements, stop the scan and back over any non-options that we skipped and permuted. */ if (optind == argc) { /* Set the next-arg-index to point at the non-options that we previously skipped, so the caller will digest them. */ if (first_nonopt != last_nonopt) { optind = first_nonopt; } return -1; } /* If we have come to a non-option and did not permute it, either stop the scan or describe it to the caller and pass it by. */ if (NONOPTION_P) { if (ordering == REQUIRE_ORDER) { return -1; } optarg = argv[optind++]; return 1; } /* We have found another option-ARGV-element. Skip the initial punctuation. */ nextchar = (argv[optind] + 1 + (longopts != NULL && argv[optind][1] == '-')); } /* Decode the current option-ARGV-element. */ /* Check whether the ARGV-element is a long option. If long_only and the ARGV-element has the form "-f", where f is a valid short option, don't consider it an abbreviated form of a long option that starts with f. Otherwise there would be no way to give the -f short option. 
On the other hand, if there's a long option "fubar" and the ARGV-element is "-fu", do consider that an abbreviation of the long option, just like "--fu", and not "-f" with arg "u". This distinction seems to be the most useful approach. */ if (longopts != NULL && (argv[optind][1] == '-' || (long_only && (argv[optind][2] || !my_index(optstring, argv[optind][1]))))) { char *nameend; const struct option *p; const struct option *pfound = NULL; int exact = 0; int ambig = 0; int indfound = -1; int option_index; for (nameend = nextchar; *nameend && *nameend != '='; nameend++) /* Do nothing. */ ; /* Test all long options for either exact match or abbreviated matches. */ for (p = longopts, option_index = 0; p->name; p++, option_index++) if (!strncmp(p->name, nextchar, nameend - nextchar)) { if ((unsigned int)(nameend - nextchar) == (unsigned int) strlen(p->name)) { /* Exact match found. */ pfound = p; indfound = option_index; exact = 1; break; } else if (pfound == NULL) { /* First nonexact match found. */ pfound = p; indfound = option_index; } else if (long_only || pfound->has_arg != p->has_arg || pfound->flag != p->flag || pfound->val != p->val) /* Second or later nonexact match found. */ { ambig = 1; } } if (ambig && !exact) { if (print_errors) fprintf(stderr, _("%s: option `%s' is ambiguous\n"), argv[0], argv[optind]); nextchar += strlen(nextchar); optind++; optopt = 0; return '?'; } if (pfound != NULL) { option_index = indfound; optind++; if (*nameend) { /* Don't test has_arg with >, because some C compilers don't allow it to be used on enums. */ if (pfound->has_arg) { optarg = nameend + 1; } else { if (print_errors) { if (argv[optind - 1][1] == '-') /* --option */ fprintf(stderr, _("%s: option `--%s' doesn't allow an argument\n"), argv[0], pfound->name); else /* +option or -option */ fprintf(stderr, _("%s: option `%c%s' doesn't allow an argument\n"), argv[0], argv[optind - 1][0], pfound->name); } nextchar += strlen(nextchar); optopt = pfound->val; return '?'; } } else if (pfound->has_arg == 1) { if (optind < argc) { optarg = argv[optind++]; } else { if (print_errors) fprintf(stderr, _("%s: option `%s' requires an argument\n"), argv[0], argv[optind - 1]); nextchar += strlen(nextchar); optopt = pfound->val; return optstring[0] == ':' ? ':' : '?'; } } nextchar += strlen(nextchar); if (longind != NULL) { *longind = option_index; } if (pfound->flag) { *(pfound->flag) = pfound->val; return 0; } return pfound->val; } /* Can't find it as a long option. If this is not getopt_long_only, or the option starts with '--' or is not a valid short option, then it's an error. Otherwise interpret it as a short option. */ if (!long_only || argv[optind][1] == '-' || my_index(optstring, *nextchar) == NULL) { if (print_errors) { if (argv[optind][1] == '-') /* --option */ fprintf(stderr, _("%s: unrecognized option `--%s'\n"), argv[0], nextchar); else /* +option or -option */ fprintf(stderr, _("%s: unrecognized option `%c%s'\n"), argv[0], argv[optind][0], nextchar); } nextchar = (char *) ""; optind++; optopt = 0; return '?'; } } /* Look at and handle the next short option-character. */ { char c = *nextchar++; char *temp = my_index(optstring, c); /* Increment `optind' when we start to process its last character. */ if (*nextchar == '\0') { ++optind; } if (temp == NULL || c == ':') { if (print_errors) { if (posixly_correct) /* 1003.2 specifies the format of this message. 
*/ fprintf(stderr, _("%s: illegal option -- %c\n"), argv[0], c); else fprintf(stderr, _("%s: invalid option -- %c\n"), argv[0], c); } optopt = c; return '?'; } /* Convenience. Treat POSIX -W foo same as long option --foo */ if (temp[0] == 'W' && temp[1] == ';') { char *nameend; const struct option *p; const struct option *pfound = NULL; int exact = 0; int ambig = 0; int indfound = 0; int option_index; /* This is an option that requires an argument. */ if (*nextchar != '\0') { optarg = nextchar; /* If we end this ARGV-element by taking the rest as an arg, we must advance to the next element now. */ optind++; } else if (optind == argc) { if (print_errors) { /* 1003.2 specifies the format of this message. */ fprintf(stderr, _("%s: option requires an argument -- %c\n"), argv[0], c); } optopt = c; if (optstring[0] == ':') { c = ':'; } else { c = '?'; } return c; } else /* We already incremented `optind' once; increment it again when taking next ARGV-elt as argument. */ { optarg = argv[optind++]; } /* optarg is now the argument, see if it's in the table of longopts. */ for (nextchar = nameend = optarg; *nameend && *nameend != '='; nameend++) /* Do nothing. */ ; /* Test all long options for either exact match or abbreviated matches. */ for (p = longopts, option_index = 0; p->name; p++, option_index++) if (!strncmp(p->name, nextchar, nameend - nextchar)) { if ((unsigned int)(nameend - nextchar) == strlen(p->name)) { /* Exact match found. */ pfound = p; indfound = option_index; exact = 1; break; } else if (pfound == NULL) { /* First nonexact match found. */ pfound = p; indfound = option_index; } else /* Second or later nonexact match found. */ { ambig = 1; } } if (ambig && !exact) { if (print_errors) fprintf(stderr, _("%s: option `-W %s' is ambiguous\n"), argv[0], argv[optind]); nextchar += strlen(nextchar); optind++; return '?'; } if (pfound != NULL) { option_index = indfound; if (*nameend) { /* Don't test has_arg with >, because some C compilers don't allow it to be used on enums. */ if (pfound->has_arg) { optarg = nameend + 1; } else { if (print_errors) fprintf(stderr, _("\ %s: option `-W %s' doesn't allow an argument\n"), argv[0], pfound->name); nextchar += strlen(nextchar); return '?'; } } else if (pfound->has_arg == 1) { if (optind < argc) { optarg = argv[optind++]; } else { if (print_errors) fprintf(stderr, _("%s: option `%s' requires an argument\n"), argv[0], argv[optind - 1]); nextchar += strlen(nextchar); return optstring[0] == ':' ? ':' : '?'; } } nextchar += strlen(nextchar); if (longind != NULL) { *longind = option_index; } if (pfound->flag) { *(pfound->flag) = pfound->val; return 0; } return pfound->val; } nextchar = NULL; return 'W'; /* Let the application handle it. */ } if (temp[1] == ':') { if (temp[2] == ':') { /* This is an option that accepts an argument optionally. */ if (*nextchar != '\0') { optarg = nextchar; optind++; } else { optarg = NULL; } nextchar = NULL; } else { /* This is an option that requires an argument. */ if (*nextchar != '\0') { optarg = nextchar; /* If we end this ARGV-element by taking the rest as an arg, we must advance to the next element now. */ optind++; } else if (optind == argc) { if (print_errors) { /* 1003.2 specifies the format of this message. */ fprintf(stderr, _("%s: option requires an argument -- %c\n"), argv[0], c); } optopt = c; if (optstring[0] == ':') { c = ':'; } else { c = '?'; } } else /* We already incremented `optind' once; increment it again when taking next ARGV-elt as argument. 
*/ { optarg = argv[optind++]; } nextchar = NULL; } } return c; } } int getopt(argc, argv, optstring) int argc; char *const *argv; const char *optstring; { return _getopt_internal(argc, argv, optstring, (const struct option *) 0, (int32_t *) 0, 0); } int getopt_long(argc, argv, options, long_options, opt_index) int argc; char *const *argv; const char *options; const struct option *long_options; int32_t *opt_index; { return _getopt_internal(argc, argv, options, long_options, opt_index, 0); } #endif /* Not ELIDE_CODE. */ #ifdef TEST /* Compile with -DTEST to make an executable for use in testing the above definition of `getopt'. */ int main(argc, argv) int argc; char **argv; { int c; int digit_optind = 0; while (1) { int this_option_optind = optind ? optind : 1; c = getopt(argc, argv, "abc:d:0123456789"); if (c == -1) { break; } switch (c) { case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': if (digit_optind != 0 && digit_optind != this_option_optind) { printf("digits occur in two different argv-elements.\n"); } digit_optind = this_option_optind; printf("option %c\n", c); break; case 'a': printf("option a\n"); break; case 'b': printf("option b\n"); break; case 'c': printf("option c with value `%s'\n", optarg); break; case '?': break; default: printf("?? getopt returned character code 0%o ??\n", c); } } if (optind < argc) { printf("non-option ARGV-elements: "); while (optind < argc) { printf("%s ", argv[optind++]); } printf("\n"); } exit(0); } #endif /* TEST */ davs2-1.6/source/test/getopt/getopt.h000066400000000000000000000152041337322544400176440ustar00rootroot00000000000000/* Declarations for getopt. Copyright (C) 1989-1994, 1996-1999, 2001 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU C Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with the GNU C Library; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. */ #ifndef _GETOPT_H #ifndef __need_getopt # define _GETOPT_H 1 #endif #include /* If __GNU_LIBRARY__ is not already defined, either we are being used standalone, or this is the first header included in the source file. If we are being used with glibc, we need to include , but that does not exist if we are standalone. So: if __GNU_LIBRARY__ is not defined, include , which will pull in for us if it's from glibc. (Why ctype.h? It's guaranteed to exist and it doesn't flood the namespace with stuff the way some other headers do.) */ #if !defined __GNU_LIBRARY__ # include #endif #ifdef __cplusplus extern "C" { #endif /* For communication from `getopt' to the caller. When `getopt' finds an option that takes an argument, the argument value is returned here. Also, when `ordering' is RETURN_IN_ORDER, each non-option ARGV-element is returned here. */ extern char *optarg; /* Index in ARGV of the next element to be scanned. This is used for communication to and from the caller and for communication between successive calls to `getopt'. 
On entry to `getopt', zero means this is the first call; initialize. When `getopt' returns -1, this is the index of the first of the non-option elements that the caller should itself scan. Otherwise, `optind' communicates from one call to the next how much of ARGV has been scanned so far. */ extern int optind; /* Callers store zero here to inhibit the error message `getopt' prints for unrecognized options. */ extern int opterr; /* Set to an option character which was unrecognized. */ extern int optopt; #ifndef __need_getopt /* Describe the long-named options requested by the application. The LONG_OPTIONS argument to getopt_long or getopt_long_only is a vector of `struct option' terminated by an element containing a name which is zero. The field `has_arg' is: no_argument (or 0) if the option does not take an argument, required_argument (or 1) if the option requires an argument, optional_argument (or 2) if the option takes an optional argument. If the field `flag' is not NULL, it points to a variable that is set to the value given in the field `val' when the option is found, but left unchanged if the option is not found. To have a long-named option do something other than set an `int' to a compiled-in constant, such as set a value from `optarg', set the option's `flag' field to zero and its `val' field to a nonzero value (the equivalent single-letter option character, if there is one). For long options that have a zero `flag' field, `getopt' returns the contents of the `val' field. */ struct option { # if (defined __STDC__ && __STDC__) || defined __cplusplus const char *name; # else char *name; # endif /* has_arg can't be an enum because some compilers complain about type mismatches in all the code that assumes it is an int. */ int has_arg; int32_t *flag; int val; }; /* Names for the values of the `has_arg' field of `struct option'. */ # define no_argument 0 # define required_argument 1 # define optional_argument 2 #endif /* need getopt */ /* Get definitions and prototypes for functions to process the arguments in ARGV (ARGC of them, minus the program name) for options given in OPTS. Return the option character from OPTS just read. Return -1 when there are no more options. For unrecognized options, or options missing arguments, `optopt' is set to the option letter, and '?' is returned. The OPTS string is a list of characters which are recognized option letters, optionally followed by colons, specifying that that letter takes an argument, to be placed in `optarg'. If a letter in OPTS is followed by two colons, its argument is optional. This behavior is specific to the GNU `getopt'. The argument `--' causes premature termination of argument scanning, explicitly telling `getopt' that there are no more options. If OPTS begins with `--', then non-option arguments are treated as arguments to the option '\0'. This behavior is specific to the GNU `getopt'. */ #if (defined __STDC__ && __STDC__) || defined __cplusplus # ifdef __GNU_LIBRARY__ /* Many other libraries have conflicting prototypes for getopt, with differences in the consts, in stdlib.h. To avoid compilation errors, only prototype getopt for the GNU C library. 
*/ extern int getopt(int __argc, char *const *__argv, const char *__shortopts); # else /* not __GNU_LIBRARY__ */ extern int getopt(); # endif /* __GNU_LIBRARY__ */ # ifndef __need_getopt extern int getopt_long(int __argc, char *const *__argv, const char *__shortopts, const struct option *__longopts, int32_t *__longind); extern int getopt_long_only(int __argc, char *const *__argv, const char *__shortopts, const struct option *__longopts, int32_t *__longind); /* Internal only. Users should not call this directly. */ extern int _getopt_internal(int __argc, char *const *__argv, const char *__shortopts, const struct option *__longopts, int32_t *__longind, int __long_only); # endif #else /* not __STDC__ */ extern int getopt(); # ifndef __need_getopt extern int getopt_long(); extern int getopt_long_only(); extern int _getopt_internal(); # endif #endif /* __STDC__ */ #ifdef __cplusplus } #endif /* Make sure we later can get all the definitions and declarations. */ #undef __need_getopt #endif /* getopt.h */ davs2-1.6/source/test/inputstream.h000066400000000000000000000104011337322544400174050ustar00rootroot00000000000000/* * inputstream.h * * Description of this file: * Inputstream Processing functions definition of the davs2 library * * -------------------------------------------------------------------------- * * davs2 - video decoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. */ #ifndef DAVS2_CHECKFRAME_H #define DAVS2_CHECKFRAME_H #include "utils.h" #include #include #include #include #include /* --------------------------------------------------------------------------- */ #define ISPIC(x) ((x) == 0xB3 || (x) == 0xB6) #define ISUNIT(x) ((x) == 0xB0 || (x) == 0xB1 || (x) == 0xB7 || ISPIC(x)) /* --------------------------------------------------------------------------- */ static __inline const uint8_t * find_start_code(const uint8_t *data, int len) { while (len >= 4 && (*(int *)data & 0x00FFFFFF) != 0x00010000) { ++data; --len; } return len >= 4 ? data : 0; } /* --------------------------------------------------------------------------- */ static int check_frame(const uint8_t *data, int len) { const uint8_t *p; const uint8_t *data0 = data; const int len0 = len; while (((p = (uint8_t *)find_start_code(data, len)) != 0) && !ISUNIT(p[3])) { len -= (int)(p - data + 4); data = p + 4; } return (int)(p ? 
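/* offset of the first unit start code relative to data0, or len0 + 1 if none was found */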
p - data0 : len0 + 1); } /* --------------------------------------------------------------------------- */ static int find_one_frame(uint8_t * data, int len, int *start, int *end) { if ((*start = check_frame(data, len)) > len) { return -1; } if ((*end = check_frame(data + *start + 4, len - *start - 4)) <= len) { *end += *start + 4; } return 0; } /* --------------------------------------------------------------------------- */ static int count_frames(uint8_t *data, int size) { int count = 0; int start, end; for (;;) { if (find_one_frame(data, size, &start, &end) < 0) { break; } if (ISPIC(data[start + 3])) { count++; } data += end; size -= end; } return count; } /* --------------------------------------------------------------------------- */ static int read_input_file(davs2_input_param_t *p_param, uint8_t **data, int *size, int *frames, float errrate) { /* get size of input file */ fseek(p_param->g_infile, 0, SEEK_END); *size = ftell(p_param->g_infile); fseek(p_param->g_infile, 0, SEEK_SET); /* memory for stream buffer */ if ((*data = (uint8_t *)calloc(*size + 1024, sizeof(uint8_t))) == NULL) { show_message(CONSOLE_RED, "failed to alloc memory for input file.\n"); return -1; } /* read stream data */ if (fread(*data, *size, 1, p_param->g_infile) < 1) { show_message(CONSOLE_RED, "failed to read input file.\n"); free(*data); *data = NULL; return -1; } if (errrate != 0) { show_message(CONSOLE_WHITE, "noise interfering is enabled:\n"); } /* get total frames */ *frames = count_frames(*data, *size); return 0; } #endif /// DAVS2_CHECKFRAME_H davs2-1.6/source/test/md5.h000066400000000000000000000207431337322544400155310ustar00rootroot00000000000000/* * md5.h * * Description of this file: * MD5 calculate function of davs2. * */ /* The copyright in this software is being made available under the BSD * License, included below. This software may be subject to other third party * and contributor rights, including patent rights, and no such rights are * granted under this license. * * Copyright (c) 2002-2016, Audio Video coding Standard Workgroup of China * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * * Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * Neither the name of Audio Video coding Standard Workgroup of China * nor the names of its contributors maybe used to endorse or promote products * derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ /* * -------------------------------------------------------------------------- * * davs2 - video decoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * Huiwen REN * etc. * */ #ifndef DAVS2_MD5_H #define DAVS2_MD5_H #include #include #include #define F(x, y, z) (((x) & (y)) | ((~x) & (z))) #define G(x, y, z) (((x) & (z)) | ((y) & (~z))) #define H(x, y, z) ((x) ^ (y) ^ (z)) #define I(x, y, z) ((y) ^ ((x) | (~z))) #define RL(x, y) (((x) << (y)) | ((x) >> (32 - (y)))) #define PP(x) (x<<24)|((x<<8)&0xff0000)|((x>>8)&0xff00)|(x>>24) #define FF(a, b, c, d, x, s, ac) a = b + (RL((a + F(b,c,d) + x + ac),s)) #define GG(a, b, c, d, x, s, ac) a = b + (RL((a + G(b,c,d) + x + ac),s)) #define HH(a, b, c, d, x, s, ac) a = b + (RL((a + H(b,c,d) + x + ac),s)) #define II(a, b, c, d, x, s, ac) a = b + (RL((a + I(b,c,d) + x + ac),s)) void md5(unsigned int *pA, unsigned int *pB, unsigned int *pC, unsigned int *pD, unsigned int x[16]) { unsigned int a, b, c, d; a = *pA; b = *pB; c = *pC; d = *pD; /**//* Round 1 */ FF(a, b, c, d, x[ 0], 7, 0xd76aa478); /**/ /* 1 */ FF(d, a, b, c, x[ 1], 12, 0xe8c7b756); /**/ /* 2 */ FF(c, d, a, b, x[ 2], 17, 0x242070db); /**/ /* 3 */ FF(b, c, d, a, x[ 3], 22, 0xc1bdceee); /**/ /* 4 */ FF(a, b, c, d, x[ 4], 7, 0xf57c0faf); /**/ /* 5 */ FF(d, a, b, c, x[ 5], 12, 0x4787c62a); /**/ /* 6 */ FF(c, d, a, b, x[ 6], 17, 0xa8304613); /**/ /* 7 */ FF(b, c, d, a, x[ 7], 22, 0xfd469501); /**/ /* 8 */ FF(a, b, c, d, x[ 8], 7, 0x698098d8); /**/ /* 9 */ FF(d, a, b, c, x[ 9], 12, 0x8b44f7af); /**/ /* 10 */ FF(c, d, a, b, x[10], 17, 0xffff5bb1); /**/ /* 11 */ FF(b, c, d, a, x[11], 22, 0x895cd7be); /**/ /* 12 */ FF(a, b, c, d, x[12], 7, 0x6b901122); /**/ /* 13 */ FF(d, a, b, c, x[13], 12, 0xfd987193); /**/ /* 14 */ FF(c, d, a, b, x[14], 17, 0xa679438e); /**/ /* 15 */ FF(b, c, d, a, x[15], 22, 0x49b40821); /**/ /* 16 */ /**//* Round 2 */ GG(a, b, c, d, x[ 1], 5, 0xf61e2562); /**/ /* 17 */ GG(d, a, b, c, x[ 6], 9, 0xc040b340); /**/ /* 18 */ GG(c, d, a, b, x[11], 14, 0x265e5a51); /**/ /* 19 */ GG(b, c, d, a, x[ 0], 20, 0xe9b6c7aa); /**/ /* 20 */ GG(a, b, c, d, x[ 5], 5, 0xd62f105d); /**/ /* 21 */ GG(d, a, b, c, x[10], 9, 0x02441453); /**/ /* 22 */ GG(c, d, a, b, x[15], 14, 0xd8a1e681); /**/ /* 23 */ GG(b, c, d, a, x[ 4], 20, 0xe7d3fbc8); /**/ /* 24 */ GG(a, b, c, d, x[ 9], 5, 0x21e1cde6); /**/ /* 25 */ GG(d, a, b, c, x[14], 9, 0xc33707d6); /**/ /* 26 */ GG(c, d, a, b, x[ 3], 14, 0xf4d50d87); /**/ /* 27 */ GG(b, c, d, a, x[ 8], 20, 0x455a14ed); /**/ /* 28 */ GG(a, b, c, d, x[13], 5, 0xa9e3e905); /**/ /* 29 */ GG(d, a, b, c, x[ 2], 9, 0xfcefa3f8); /**/ /* 30 */ GG(c, d, a, b, x[ 7], 14, 0x676f02d9); /**/ /* 31 */ GG(b, c, d, a, x[12], 20, 0x8d2a4c8a); /**/ /* 32 */ /**//* Round 3 */ HH(a, b, c, d, x[ 5], 4, 0xfffa3942); /**/ /* 33 */ HH(d, a, b, c, x[ 8], 11, 0x8771f681); /**/ /* 34 */ HH(c, d, a, b, x[11], 16, 0x6d9d6122); /**/ /* 35 */ HH(b, c, d, a, x[14], 23, 0xfde5380c); /**/ /* 36 */ HH(a, b, c, d, x[ 1], 
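/* round-3 rotation amounts cycle through 4, 11, 16, 23 */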
4, 0xa4beea44); /**/ /* 37 */ HH(d, a, b, c, x[ 4], 11, 0x4bdecfa9); /**/ /* 38 */ HH(c, d, a, b, x[ 7], 16, 0xf6bb4b60); /**/ /* 39 */ HH(b, c, d, a, x[10], 23, 0xbebfbc70); /**/ /* 40 */ HH(a, b, c, d, x[13], 4, 0x289b7ec6); /**/ /* 41 */ HH(d, a, b, c, x[ 0], 11, 0xeaa127fa); /**/ /* 42 */ HH(c, d, a, b, x[ 3], 16, 0xd4ef3085); /**/ /* 43 */ HH(b, c, d, a, x[ 6], 23, 0x04881d05); /**/ /* 44 */ HH(a, b, c, d, x[ 9], 4, 0xd9d4d039); /**/ /* 45 */ HH(d, a, b, c, x[12], 11, 0xe6db99e5); /**/ /* 46 */ HH(c, d, a, b, x[15], 16, 0x1fa27cf8); /**/ /* 47 */ HH(b, c, d, a, x[ 2], 23, 0xc4ac5665); /**/ /* 48 */ /**//* Round 4 */ II(a, b, c, d, x[ 0], 6, 0xf4292244); /**/ /* 49 */ II(d, a, b, c, x[ 7], 10, 0x432aff97); /**/ /* 50 */ II(c, d, a, b, x[14], 15, 0xab9423a7); /**/ /* 51 */ II(b, c, d, a, x[ 5], 21, 0xfc93a039); /**/ /* 52 */ II(a, b, c, d, x[12], 6, 0x655b59c3); /**/ /* 53 */ II(d, a, b, c, x[ 3], 10, 0x8f0ccc92); /**/ /* 54 */ II(c, d, a, b, x[10], 15, 0xffeff47d); /**/ /* 55 */ II(b, c, d, a, x[ 1], 21, 0x85845dd1); /**/ /* 56 */ II(a, b, c, d, x[ 8], 6, 0x6fa87e4f); /**/ /* 57 */ II(d, a, b, c, x[15], 10, 0xfe2ce6e0); /**/ /* 58 */ II(c, d, a, b, x[ 6], 15, 0xa3014314); /**/ /* 59 */ II(b, c, d, a, x[13], 21, 0x4e0811a1); /**/ /* 60 */ II(a, b, c, d, x[ 4], 6, 0xf7537e82); /**/ /* 61 */ II(d, a, b, c, x[11], 10, 0xbd3af235); /**/ /* 62 */ II(c, d, a, b, x[ 2], 15, 0x2ad7d2bb); /**/ /* 63 */ II(b, c, d, a, x[ 9], 21, 0xeb86d391); /**/ /* 64 */ *pA += a; *pB += b; *pC += c; *pD += d; } long long FileMD5(const char *filename, unsigned int md5value[4]) { FILE *p_infile = NULL; int i; unsigned int flen[2]; long long len; unsigned int A, B, C, D; unsigned int x[16]; memset(md5value, 0, 4 * sizeof(unsigned int)); if (filename == NULL) { return 0; } if (strlen(filename) > 0 && (p_infile = fopen(filename, "rb")) == NULL) { show_message(CONSOLE_RED, "Input file %s does not exist", filename); return 0; } fseek(p_infile, 0, SEEK_END); len = ftell(p_infile); fseek(p_infile, 0, SEEK_SET); if (len == -1) { show_message(CONSOLE_RED, "Input file %s is too large to calculate md5!\n", filename); fclose(p_infile); return 0; } A = 0x67452301, B = 0xefcdab89, C = 0x98badcfe, D = 0x10325476; flen[1] = (unsigned int)(len / 0x20000000); flen[0] = (unsigned int)((len % 0x20000000) * 8); memset(x, 0, 64); fread(&x, 4, 16, p_infile); for (i = 0; i < len / 64; i++) { md5(&A, &B, &C, &D, x); memset(x, 0, 64); fread(&x, 4, 16, p_infile); } ((char *)x)[len % 64] = 128; if (len % 64 > 55) { md5(&A, &B, &C, &D, x); memset(x, 0, 64); } memcpy(x + 14, flen, 8); md5(&A, &B, &C, &D, x); fclose(p_infile); md5value[0] = PP(A); md5value[1] = PP(B); md5value[2] = PP(C); md5value[3] = PP(D); return len; } #endif // DAVS2_MD5_H davs2-1.6/source/test/parse_args.h000066400000000000000000000173731337322544400171770ustar00rootroot00000000000000/* * parse_args.h * * Description of this file: * Argument Parsing functions definition of the davs2 library * * -------------------------------------------------------------------------- * * davs2 - video decoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at sswang @ pku.edu.cn.
 */

#ifndef DAVS2_GETOPT_H
#define DAVS2_GETOPT_H

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#if _WIN32
#include <io.h>     /* for setmode()/fileno() */
#include <fcntl.h>  /* for O_BINARY */
#endif
#include "utils.h"

typedef struct davs2_input_param_t {
    const char *s_infile;
    const char *s_outfile;
    const char *s_recfile;
    const char *s_md5;
    int g_verbose;
    int g_psnr;
    int g_threads;
    int b_y4m;              // Y4M or YUV
    FILE *g_infile;
    FILE *g_recfile;
    FILE *g_outfile;
} davs2_input_param_t;

#if defined(__ICL) || defined(_MSC_VER)
#define strcasecmp _stricmp
#endif

/* options that take an argument are followed by a colon */
static const char *optString = "i:o:r:m:t:vh?";

static const struct option longOpts[] = {
    {"input", required_argument, NULL, 'i'},
    {"output", required_argument, NULL, 'o'},
    {"psnr", required_argument, NULL, 'r'},
    {"md5", required_argument, NULL, 'm'},
    {"threads", required_argument, NULL, 't'},
    {"verbose", no_argument, NULL, 'v'},
    {"help", no_argument, NULL, 'h'},
    {NULL, no_argument, NULL, 0}
};

/* --------------------------------------------------------------------------- */
static void display_usage(void)
{
    /* description of the command-line options */
    const char * usage = "usage: davs2.exe --input avsfile --output=outputfile [--psnr=recfile] [--threads=threads] [--verbose]";
    show_message(CONSOLE_RED, "davs2.exe command-line options:\n %s\n", usage);
    show_message(CONSOLE_RED, " --input=test.avs or -i test.avs set the path of the input bitstream file\n");
    show_message(CONSOLE_RED, " --output=test_dec.yuv or -o test_dec.yuv set the path of the output file\n");
    show_message(CONSOLE_RED, " --psnr=test_rec.yuv or -r test_rec.yuv set the encoder reconstruction file, used to check whether the decoded output matches\n");
    show_message(CONSOLE_RED, " --md5=md5 or -m md5 set the reference MD5 used to check whether the decoded output matches\n");
    show_message(CONSOLE_RED, " --threads=N or -t N set the number of decoding threads, default 1\n");
    show_message(CONSOLE_RED, " --verbose or -v show information on every decoded frame\n");
    show_message(CONSOLE_RED, " --help or -h show this help message\n");
    show_message(CONSOLE_RED, "------------------------------------------------------------\n");
}

/* --------------------------------------------------------------------------- */
static int parse_args(davs2_input_param_t *p_param, int argc, char **argv)
{
    char title[1024] = {0};
    int i;
    int opt = 0;
    int longIndex = 0;

    for (i = 0; i < argc; ++i) {
        sprintf(&title[strlen(title)], "%s ", argv[i]);
    }
    show_message(CONSOLE_WHITE, "%s\n\n", title);

    if (argc < 2) {
        display_usage();
        return -1;
    }

    /* Initialize globalArgs before we get to work. */
    p_param->s_infile = NULL;
    p_param->s_outfile = NULL;
    p_param->s_recfile = NULL;
    p_param->s_md5 = NULL;
    p_param->g_infile = NULL;
    p_param->g_outfile = NULL;
    p_param->g_recfile = NULL;
    p_param->g_verbose = 0;
    p_param->g_psnr = 0;
    p_param->g_threads = 1;
    p_param->b_y4m = 0;

    opt = getopt_long(argc, argv, optString, longOpts, &longIndex);
    while (opt != -1) {
        switch (opt) {
        case 'i':
            p_param->s_infile = optarg;
            break;
        case 'o':
            p_param->s_outfile = optarg;
            break;
        case 'r':
            p_param->s_recfile = optarg;
            break;
        case 'm':
            p_param->s_md5 = optarg;
            break;
        case 'v':
            p_param->g_verbose = 1;
            break;
        case 't':
            p_param->g_threads = atoi(optarg);
            break;
        case 'h':   /* fall-through is intentional */
        case '?':
            display_usage();
            break;
        case 0:     /* long option without a short arg */
            break;
        default:    /* You won't actually get here. */
            break;
        }
        opt = getopt_long(argc, argv, optString, longOpts, &longIndex);
    }

    if (p_param->s_infile == NULL) {
        display_usage();
        show_message(CONSOLE_RED, "missing input file.\n");
        return -1;
    }
    p_param->g_infile = fopen(p_param->s_infile, "rb");
    if (p_param->s_recfile != NULL) {
        p_param->g_recfile = fopen(p_param->s_recfile, "rb");
    }
    if (p_param->s_outfile != NULL) {
        if (!strcmp(p_param->s_outfile, "stdout")) {
            p_param->g_outfile = stdout;
        } else {
            p_param->g_outfile = fopen(p_param->s_outfile, "wb");
        }
    } else if (p_param->g_outfile == NULL) {
        display_usage();
        show_message(CONSOLE_RED, "WARN: missing output file.\n");
    }

    /* open stream file */
    if (p_param->g_infile == NULL) {
        show_message(CONSOLE_RED, "ERROR: failed to open input file: %s\n", p_param->s_infile);
        return -1;
    }

    /* open rec file */
    if (p_param->s_recfile != NULL && p_param->g_recfile == NULL) {
        show_message(CONSOLE_RED, "ERROR: failed to open reference file: %s\n", p_param->s_recfile);
    }
    p_param->g_psnr = (p_param->g_recfile != NULL);

    /* open output file */
    if (p_param->s_outfile != NULL && p_param->g_outfile == NULL) {
        show_message(CONSOLE_RED, "ERROR: failed to open output file: %s\n", p_param->s_outfile);
    } else if (p_param->s_outfile != NULL) {   /* guard against strlen(NULL) when no output file was given */
        int l = (int)strlen(p_param->s_outfile);
        if (l > 4) {
            if (!strcmp(p_param->s_outfile + l - 4, ".y4m")) {
                p_param->b_y4m = 1;
            }
        }
        if (p_param->g_outfile == stdout) {
#if _WIN32
            setmode(fileno(stdout), O_BINARY);
#endif
            p_param->b_y4m = 1;
        }
    }

    /* get md5 */
    if (p_param->s_md5 && strlen(p_param->s_md5) != 32) {
        show_message(CONSOLE_RED, "ERROR: invalid md5 value\n");
    }

    show_message(CONSOLE_WHITE, "--------------------------------------------------\n");
    show_message(CONSOLE_WHITE, " AVS2 file : %s\n", p_param->s_infile);
    show_message(CONSOLE_WHITE, " Reference file : %s\n", p_param->s_recfile);
    show_message(CONSOLE_WHITE, " Output file : %s\n", p_param->s_outfile);
    show_message(CONSOLE_WHITE, "--------------------------------------------------\n");

    return 0;
}

#endif /// DAVS2_GETOPT_H
davs2-1.6/source/test/psnr.h000066400000000000000000000250241337322544400160230ustar00rootroot00000000000000/*
 * psnr.h
 *
 * Description of this file:
 *    PSNR Calculating functions definition of the davs2 library
 *
 * --------------------------------------------------------------------------
 *
 * davs2 - video decoder of AVS2/IEEE1857.4 video coding standard
 * Copyright (C) 2018~ VCL, NELVT, Peking University
 *
 * Authors: Falei LUO
 *          etc.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at sswang @ pku.edu.cn.
 */

#ifndef DAVS2_PSNR_H
#define DAVS2_PSNR_H

#ifdef _MSC_VER
#undef fseek
#define fseek _fseeki64
#else  //! for linux
#define _FILE_OFFSET_BITS 64   // for 64 bit fseeko
#define fseek fseeko
#endif

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#if HAVE_STDINT_H
#include <stdint.h>
#else
#include <inttypes.h>
#endif

int g_width = 0;
int g_lines = 0;
int b_output_error_position = 1;
double g_sum_psnr_y = 0.0;
double g_sum_psnr_u = 0.0;
double g_sum_psnr_v = 0.0;
uint8_t *g_recbuf = NULL;

/* --------------------------------------------------------------------------- */
static __inline uint64_t
cal_ssd_16bit(int width, int height, uint16_t *rec, int rec_stride, uint16_t *dst, int dst_stride)
{
    uint64_t d = 0;
    int i, j;

    if (rec_stride == dst_stride) {
        if (memcmp(dst, rec, rec_stride * height * 2) == 0) {
            return 0;
        }
    }

    for (j = 0; j < height; j++) {
        for (i = 0; i < width; i++) {
            int t = dst[i] - rec[i];
            d += t * t;
        }
        rec += rec_stride;
        dst += dst_stride;
    }

    return d;
}

/* --------------------------------------------------------------------------- */
static __inline uint64_t
cal_ssd_8bit(int width, int height, uint8_t *rec, int rec_stride, uint8_t *dst, int dst_stride)
{
    uint64_t d = 0;
    int i, j;

    if (rec_stride == dst_stride) {
        if (memcmp(dst, rec, rec_stride * height) == 0) {
            return 0;
        }
    }

    for (j = 0; j < height; j++) {
        for (i = 0; i < width; i++) {
            int t = dst[i] - rec[i];
            d += t * t;
        }
        rec += rec_stride;
        dst += dst_stride;
    }

    return d;
}

/* ---------------------------------------------------------------------------
 * Function   : calculate the SSD of 2 frames
 * Parameters :
 *      [in ] : width      - width of frame
 *            : height     - height of frame
 *            : rec        - pointer to reconstructed frame buffer
 *            : rec_stride - stride of reconstructed frame
 *            : dst        - pointer to decoded frame buffer
 *            : dst_stride - stride of decoded frame
 *      [out] : none
 * Return     : SSD of 2 frames
 * --------------------------------------------------------------------------- */
static __inline uint64_t
cal_ssd(int width, int height, uint8_t *rec, int rec_stride, uint8_t *dst, int dst_stride, int bytes_per_sample)
{
    if (bytes_per_sample == 2) {
        return cal_ssd_16bit(width, height, (uint16_t *)rec, rec_stride, (uint16_t *)dst, dst_stride >> 1);
    } else {
        return cal_ssd_8bit(width, height, rec, rec_stride, dst, dst_stride);
    }
}

/* --------------------------------------------------------------------------- */
static void
find_first_mismatch_point_16bit(int width, int height, uint16_t *rec, int rec_stride, uint16_t *dst, int dst_stride, int *x, int *y)
{
    int i, j;

    *x = -1;
    *y = -1;
    for (j = 0; j < height; j++) {
        for (i = 0; i < width; i++) {
            int t = dst[i] - rec[i];
            if (t != 0) {
                *x = i;
                *y = j;
                return;   /* stop at the first mismatching sample */
            }
        }
        rec += rec_stride;
        dst += dst_stride;
    }
}

/* --------------------------------------------------------------------------- */
static void
find_first_mismatch_point_8bit(int width, int height, uint8_t *rec, int rec_stride, uint8_t *dst, int dst_stride, int *x, int *y)
{
    int i, j;

    *x = -1;
    *y = -1;
    for (j = 0; j < height; j++) {
        for (i = 0; i < width; i++) {
            int t = dst[i] - rec[i];
            if (t != 0) {
                *x = i;
                *y = j;
                return;   /* stop at the first mismatching sample */
            }
        }
        rec += rec_stride;
        dst += dst_stride;
    }
}

/* ---------------------------------------------------------------------------
 * Function   : find the first mismatch point between 2 frames
 * Parameters :
 *      [in ] : width      - width of frame
 *            : height     - height of frame
 *            : rec        - pointer to reconstructed frame buffer
 *            : rec_stride - stride of reconstructed frame
 *            : dst        - pointer to decoded frame buffer
 *            : dst_stride - stride of decoded frame
 *      [out] : x, y       - position of the first mismatch point, (-1, -1) if identical
 * Return     : void
 * --------------------------------------------------------------------------- */
static void
find_first_mismatch_point(int width, int height, uint8_t *rec, int rec_stride, uint8_t *dst, int dst_stride, int bytes_per_sample, int *x, int *y)
{
    if (bytes_per_sample == 2) {
        find_first_mismatch_point_16bit(width, height, (uint16_t *)rec, rec_stride, (uint16_t *)dst, dst_stride >> 1, x, y);
    } else {
        find_first_mismatch_point_8bit(width, height, rec, rec_stride, dst, dst_stride, x, y);
    }
}

/* --------------------------------------------------------------------------- */
static double get_psnr_with_ssd(double f_max, uint64_t diff)
{
    if (diff > 0) {
        return 10.0 * log10(f_max / diff);
    } else {
        return 0;   /* identical planes are reported as 0 and treated as a match by the caller */
    }
}

/* ---------------------------------------------------------------------------
 * Function   : calculate and output the psnr (only for YUV 4:2:0)
 * Parameters :
 *      [in ] : rec    - pointer to buffer of reconstructed picture
 *            : dst    - pointer to buffer of decoded picture
 *            : width  - width of picture
 *            : height - height of picture
 *      [out] : none
 * Return     : 0 on success, -1 on failure
 * --------------------------------------------------------------------------- */
int cal_psnr(int number, uint8_t *dst[3], int strides[3], FILE *f_rec,
             int width, int height, int num_planes,
             double *psnr_y, double *psnr_u, double *psnr_v,
             int bytes_per_sample, int bit_depth)
{
    int stride_ref = width;          /* stride of frame/field (luma) */
    int size_l = width * height;     /* size of frame/field (luma) */
    uint8_t *p1;                     /* pointer to buffer of reconstructed picture */
    uint8_t *p2;                     /* pointer to buffer of decoded picture */
    uint64_t diff;                   /* difference between decoded and reconstructed picture */
    size_t size_frame = num_planes == 3 ?
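/* 4:2:0: the two half-resolution chroma planes add half of the luma samples, so one frame holds 3/2 * size_l samples */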
(bytes_per_sample * size_l * 3) >> 1 : (bytes_per_sample * size_l); //solve warning C4018 double f_max_signal = ((1 << bit_depth) - 1) * ((1 << bit_depth) - 1); int64_t frameno = number; *psnr_y = *psnr_u = *psnr_v = 0.f; if (width != g_width || height != g_lines) { if (g_recbuf) { free(g_recbuf); g_recbuf = NULL; } g_recbuf = (uint8_t *)malloc(size_frame); if (g_recbuf == NULL) { return -1; } g_width = width; g_lines = height; } if (g_recbuf == 0) { return -1; } fseek(f_rec, size_frame * frameno, SEEK_SET); if (fread(g_recbuf, 1, size_frame, f_rec) < size_frame) { return -1; } p1 = g_recbuf; p2 = dst[0]; diff = cal_ssd(width, height, p1, stride_ref, p2, strides[0], bytes_per_sample); *psnr_y = get_psnr_with_ssd(f_max_signal * size_l, diff); g_sum_psnr_y += *psnr_y; if (diff != 0 && b_output_error_position) { int x, y; find_first_mismatch_point(width, height, p1, stride_ref, p2, strides[0], bytes_per_sample, &x, &y); show_message(CONSOLE_RED, "mismatch POC: %3d, Y(%d, %d)\n", number, x, y); b_output_error_position = 0; } if (num_planes == 3) { width >>= 1; // width of frame/field (chroma) height >>= 1; // height of frame/field (chroma, with padding) stride_ref >>= 1; // stride of frame/field (chroma) /* PSNR U */ p1 += size_l * bytes_per_sample; p2 = dst[1]; diff = cal_ssd(width, height, p1, stride_ref, p2, strides[1], bytes_per_sample); *psnr_u = get_psnr_with_ssd(f_max_signal * size_l, diff << 2); g_sum_psnr_u += *psnr_u; if (diff != 0 && b_output_error_position) { int x, y; find_first_mismatch_point(width, height, p1, stride_ref, p2, strides[1], bytes_per_sample, &x, &y); show_message(CONSOLE_RED, "mismatch POC: %3d, U (%d, %d) => Y(%d, %d)\n", number, x, y, 2 * x, 2 * y); b_output_error_position = 0; } /* PSNR V */ p1 += (size_l * bytes_per_sample) >> 2; p2 = dst[2]; diff = cal_ssd(width, height, p1, stride_ref, p2, strides[2], bytes_per_sample); *psnr_v = get_psnr_with_ssd(f_max_signal * size_l, diff << 2); g_sum_psnr_v += *psnr_v; if (diff != 0 && b_output_error_position) { int x, y; find_first_mismatch_point(width, height, p1, stride_ref, p2, strides[2], bytes_per_sample, &x, &y); show_message(CONSOLE_RED, "mismatch POC: %3d, V (%d, %d) => Y(%d, %d)\n", number, x, y, 2 * x, 2 * y); b_output_error_position = 0; } } return 0; } #endif /// DAVS2_PSNR_H davs2-1.6/source/test/test.c000066400000000000000000000303651337322544400160170ustar00rootroot00000000000000/* * test.c * * Description of this file: * test the AVS2 Video Decoder davs2 library * * -------------------------------------------------------------------------- * * davs2 - video decoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. 
 */

#if defined(_MSC_VER)
#define WIN32_LEAN_AND_MEAN
#define _CRT_NONSTDC_NO_DEPRECATE
#define _CRT_SECURE_NO_DEPRECATE
#define _CRT_SECURE_NO_WARNINGS
#endif

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "davs2.h"
#include "utils.h"
#include "psnr.h"
#include "parse_args.h"
#include "inputstream.h"
#include "md5.h"

#if defined(_MSC_VER)
#pragma comment(lib, "libdavs2.lib")
#endif

#if defined(__cplusplus)
extern "C" {
#endif /* __cplusplus */

/**
 * ===========================================================================
 * macro defines
 * ===========================================================================
 */
#define CTRL_LOOP_DEC_FILE 0   /* loop decoding of one ES file (for testing) */

/* ---------------------------------------------------------------------------
 * disable warning C4100: unreferenced formal parameter
 */
#ifndef UNREFERENCED_PARAMETER
#if defined(_MSC_VER) || defined(__INTEL_COMPILER)
#define UNREFERENCED_PARAMETER(v) (v)
#else
#define UNREFERENCED_PARAMETER(v) (void)(v)
#endif
#endif

/**
 * ===========================================================================
 * global variables
 * ===========================================================================
 */
int g_frmcount = 0;
int g_psnrfail = 0;
unsigned int MD5val[4];
char MD5str[33];
davs2_input_param_t inputparam = { NULL, NULL, NULL, NULL, 0, 0, 0, 0 };

/**
 * ===========================================================================
 * function defines
 * ===========================================================================
 */

/* --------------------------------------------------------------------------- */
static void output_decoded_frame(davs2_picture_t *pic, davs2_seq_info_t *headerset, int ret_type, int num_frames)
{
    static char IMGTYPE[] = {'I', 'P', 'B', 'G', 'F', 'S', '\x0'};
    double psnr_y = 0.0f, psnr_u = 0.0f, psnr_v = 0.0f;

    if (headerset == NULL) {
        return;
    }

    if (pic == NULL || ret_type == DAVS2_GOT_HEADER) {
        show_message(CONSOLE_GREEN,
                     " Sequence size: %dx%d, %d/%d-bit %.3lf Hz. ProfileLevel: 0x%x-0x%x\n\n",
                     headerset->width, headerset->height,
                     headerset->internal_bit_depth, headerset->output_bit_depth,
                     headerset->frame_rate,
                     headerset->profile_id, headerset->level_id);
        if (inputparam.b_y4m) {
            static const int FRAME_RATE[9][2] = {
                { 0, 1 },          // invalid
                { 24000, 1001 }, { 24, 1 }, { 25, 1 }, { 30000, 1001 },
                { 30, 1 }, { 50, 1 }, { 60000, 1001 }, { 60, 1 }
            };
            int fps_num = FRAME_RATE[headerset->frame_rate_id][0];
            int fps_den = FRAME_RATE[headerset->frame_rate_id][1];
            write_y4m_header(inputparam.g_outfile, headerset->width, headerset->height, fps_num, fps_den, headerset->output_bit_depth);
        }
        return;
    }

    if (inputparam.g_psnr) {
        int ret = cal_psnr(pic->pic_order_count, pic->planes, pic->strides,
                           inputparam.g_recfile, pic->widths[0], pic->lines[0],
                           pic->num_planes, &psnr_y, &psnr_u, &psnr_v,
                           pic->bytes_per_sample, pic->bit_depth);
        int psnr = (psnr_y != 0 || psnr_u != 0 || psnr_v != 0);
        if (ret < 0) {
            g_psnrfail = 1;
            show_message(CONSOLE_RED, "failed to cal psnr for frame %d(%d).\t\t\t\t\n",
                         g_frmcount, pic->pic_order_count);
        } else {
            if (inputparam.g_verbose || psnr) {
                show_message(psnr ? CONSOLE_RED : CONSOLE_WHITE,
                             "%5d(%d)\t(%c) %3d\t%8.4lf %8.4lf %8.4lf \t%6lld %6lld\n",
                             g_frmcount, pic->pic_order_count, IMGTYPE[pic->type], pic->qp,
                             psnr_y, psnr_u, psnr_v, pic->pts, pic->dts);
            }
        }
    } else if (inputparam.g_verbose) {
        show_message(CONSOLE_WHITE, "%5d(%d)\t(%c)\t%3d\n",
                     g_frmcount, pic->pic_order_count, IMGTYPE[pic->type], pic->qp);
    }

    g_frmcount++;
    if (inputparam.g_verbose == 0) {
        show_progress(g_frmcount, num_frames);
    }
    if (inputparam.g_outfile) {
        write_frame(pic, inputparam.g_outfile, inputparam.b_y4m);
    }
}

/* ---------------------------------------------------------------------------
 * data_buf - pointer to bitstream buffer
 * data_len - number of bytes in bitstream buffer
 * frames   - number of frames in bitstream buffer
 */
void test_decoder(uint8_t *data_buf, int data_len, int num_frames, char *dst)
{
    const double f_time_fac = 1.0 / (double)CLOCKS_PER_SEC;
    davs2_param_t    param;      // decoding parameters
    davs2_packet_t   packet;     // input bitstream
    davs2_picture_t  out_frame;  // output data, frame data
    davs2_seq_info_t headerset;  // output data, sequence header
    int got_frame;
#if CTRL_LOOP_DEC_FILE
    uint8_t *bak_data_buf = data_buf;
    int bak_data_len = data_len;
    int num_loop = 5;            // number of times to loop-decode the whole stream
#endif
    int64_t time0, time1;
    void *decoder;
    const uint8_t *data = data_buf;
    const uint8_t *data_next_start_code;
    int user_dts = 0;            // only used to check that DTS and PTS are passed through correctly

    /* init the decoder */
    memset(&param, 0, sizeof(param));
    param.threads     = inputparam.g_threads;
    param.opaque      = (void *)(intptr_t)num_frames;
    param.info_level  = DAVS2_LOG_DEBUG;
    param.disable_avx = 0;       // on some platforms, disabling AVX (setting this to 1) is faster

    decoder = davs2_decoder_open(&param);
    time0 = get_time();

    /* do decoding */
    for (;;) {
        int len;

        data_next_start_code = find_start_code(data + 4, data_len - 4);
        if (data_next_start_code) {
            len = data_next_start_code - data;
        } else {
            len = data_len;
        }

        packet.data = data;
        packet.len  = len;
        // set PTS/DTS, only used to check that they are passed through correctly
        packet.pts  = user_dts;
        packet.dts  = -user_dts;
        user_dts++;

        got_frame = davs2_decoder_send_packet(decoder, &packet);
        if (got_frame == DAVS2_ERROR) {
            show_message(CONSOLE_RED, "Error: a decoder error occurred\n");
            break;
        }

        got_frame = davs2_decoder_recv_frame(decoder, &headerset, &out_frame);
        if (got_frame != DAVS2_DEFAULT) {
            output_decoded_frame(&out_frame, &headerset, got_frame, num_frames);
            davs2_decoder_frame_unref(decoder, &out_frame);
        }

        data_len -= len;
        data += len;   // do not use [data = data_next_start_code]: it is NULL at the end of the stream

        if (!data_len) {
#if CTRL_LOOP_DEC_FILE
            data_len = bak_data_len;
            data = data_buf;
            num_loop--;
            if (num_loop <= 0) {
                break;
            }
#else
            break;     /* end of bitstream */
#endif
        }
    }

    /* flush the decoder */
    for (;;) {
        got_frame = davs2_decoder_flush(decoder, &headerset, &out_frame);
        if (got_frame == DAVS2_ERROR || got_frame == DAVS2_END) {
            break;
        }
        if (got_frame != DAVS2_DEFAULT) {
            output_decoded_frame(&out_frame, &headerset, got_frame, num_frames);
            davs2_decoder_frame_unref(decoder, &out_frame);
        }
    }

    time1 = get_time();

    /* close the decoder */
    davs2_decoder_close(decoder);

    /* statistics */
    show_message(CONSOLE_WHITE, "\n--------------------------------------------------\n");
    show_message(CONSOLE_GREEN, "total frames: %d/%d\n", g_frmcount, num_frames);

    if (inputparam.g_psnr) {
        if (g_psnrfail == 0 && g_frmcount != 0) {
            show_message(CONSOLE_GREEN, "average PSNR:\t%8.4f, %8.4f, %8.4f\n\n",
                         g_sum_psnr_y / g_frmcount, g_sum_psnr_u / g_frmcount, g_sum_psnr_v / g_frmcount);
            sprintf(dst, " Frames: %d/%d\n TIME 
: %.3lfs, %6.2lf fps\n PSNR : %8.4f, %8.4f, %8.4f\n", g_frmcount, num_frames, (double)((time1 - time0) * f_time_fac), (double)(g_frmcount / ((time1 - time0) * f_time_fac)), g_sum_psnr_y / g_frmcount, g_sum_psnr_u / g_frmcount, g_sum_psnr_v / g_frmcount); } else { show_message(CONSOLE_RED, "average PSNR:\tNaN, \tNaN, \tNaN\n\n"); /* 'NaN' for 'Not a Number' */ } } show_message(CONSOLE_GREEN, "total decoding time: %.3lfs, %6.2lf fps\n", (double)((time1 - time0) * f_time_fac), (double)(g_frmcount / ((time1 - time0) * f_time_fac))); } /* --------------------------------------------------------------------------- */ int main(int argc, char *argv[]) { char dst[1024] = "> no decode data\n"; uint8_t *data = NULL; clock_t tm_start = clock(); int size; int frames; long long filelength; memset(MD5val, 0, 16); memset(MD5str, 0, 33); /* parse params */ if (parse_args(&inputparam, argc, argv) < 0) { sprintf(dst, "Failed to parse input parameters\n"); goto fail; } /* read input data */ if (read_input_file(&inputparam, &data, &size, &frames, 0.0f) < 0) { sprintf(dst, "Failed to read input bit-stream or create output file\n"); goto fail; } /* test decoding */ test_decoder(data, size, frames, dst); show_message(CONSOLE_WHITE, "\n Decoder Total Time: %.3lf s\n", (clock() - tm_start) / (double)(CLOCKS_PER_SEC)); fail: /* tidy up */ if (data) { free(data); } if (g_recbuf) { free(g_recbuf); } if (inputparam.g_infile) { fclose(inputparam.g_infile); } if (inputparam.g_recfile) { fclose(inputparam.g_recfile); } if (inputparam.g_outfile) { fclose(inputparam.g_outfile); } /* calculate MD5 */ if (inputparam.s_md5 && strlen(inputparam.s_md5) == 32) { filelength = FileMD5(inputparam.s_outfile, MD5val); sprintf (MD5str,"%08X%08X%08X%08X", MD5val[0], MD5val[1], MD5val[2], MD5val[3]); if (strcmp(MD5str,inputparam.s_md5)) { show_message(CONSOLE_RED, "\n MD5 match failed\n"); show_message(CONSOLE_WHITE, " Input MD5 : %s \n", inputparam.s_md5); show_message(CONSOLE_WHITE, " Output MD5 : %s \n", MD5str); } else { show_message(CONSOLE_WHITE, "\n MD5 match success \n"); } } show_message(CONSOLE_WHITE, " Decoder Exit, Time: %.3lf s\n", (clock() - tm_start) / (double)(CLOCKS_PER_SEC)); return 0; } #if defined(__cplusplus) } #endif /* __cplusplus */ davs2-1.6/source/test/utils.h000066400000000000000000000140071337322544400162000ustar00rootroot00000000000000/* * utils.h * * Description of this file: * functions definition of the davs2 library * * -------------------------------------------------------------------------- * * davs2 - video decoder of AVS2/IEEE1857.4 video coding standard * Copyright (C) 2018~ VCL, NELVT, Peking University * * Authors: Falei LUO * etc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at sswang @ pku.edu.cn. 
*/ #ifndef DAVS2_UTILS_H #define DAVS2_UTILS_H #include #include #include #include "davs2.h" #define CONSOLE_WHITE 0 #define CONSOLE_YELLOW 1 #define CONSOLE_RED 2 #define CONSOLE_GREEN 3 #if __ANDROID__ #include #include #define LOGE(format,...) __android_log_print(ANDROID_LOG_ERROR,"davs2", format,##__VA_ARGS__) #endif #if _WIN32 #include #include #include #else #include #endif #include /* --------------------------------------------------------------------------- * time */ static __inline int64_t get_time() { #if _WIN32 struct timeb tb; ftime(&tb); return ((int64_t)tb.time * CLOCKS_PER_SEC + (int64_t)tb.millitm); #else struct timeval tv_date; gettimeofday(&tv_date, NULL); return (int64_t)(tv_date.tv_sec * CLOCKS_PER_SEC + (int64_t)tv_date.tv_usec); #endif } #if _WIN32 /* --------------------------------------------------------------------------- */ static __inline void set_font_color(int color) { WORD colors[] = { FOREGROUND_RED | FOREGROUND_GREEN | FOREGROUND_BLUE, FOREGROUND_INTENSITY | FOREGROUND_RED | FOREGROUND_GREEN, FOREGROUND_INTENSITY | FOREGROUND_RED, FOREGROUND_INTENSITY | FOREGROUND_GREEN, }; SetConsoleTextAttribute(GetStdHandle(STD_OUTPUT_HANDLE), colors[color]); } #endif /* --------------------------------------------------------------------------- */ static void show_message(int color, const char *format, ...) { char message[1024] = { 0 }; va_list arg_ptr; va_start(arg_ptr, format); vsprintf(message, format, arg_ptr); va_end(arg_ptr); #if _WIN32 set_font_color(color); /* set color */ fprintf(stderr, "%s", message); set_font_color(0); /* restore to white color */ #elif __ANDROID__ LOGE("%s", message); #else fprintf(stderr, "%s", message); #endif } /* --------------------------------------------------------------------------- */ static __inline void show_progress(int frame, int frames) { static int64_t first_time = 0; static int64_t last_time = 0; float fps = 0.0f; int64_t total_time = 0; int64_t cur_time = get_time(); int eta; if (first_time == 0) { first_time = cur_time; } else { total_time = cur_time - first_time; fps = frame * 1.0f / total_time * CLOCKS_PER_SEC; } if (cur_time - last_time < 300 && frame != frames) { return; } last_time = cur_time; eta = (int)((frames - frame) * total_time / frame) / (CLOCKS_PER_SEC / 1000); show_message(CONSOLE_WHITE, "\r frames: %4d/%4d, fps: %4.1f, LeftTime: %8.3f sec\r", frame, frames, fps, eta * 0.001); } /* --------------------------------------------------------------------------- */ static void write_frame_plane(FILE *fp_out, const uint8_t *p_src, int img_w, int img_h, int bytes_per_sample, int i_stride) { const int size_line = img_w * bytes_per_sample; int i; for (i = 0; i < img_h; i++) { fwrite(p_src, size_line, 1, fp_out); p_src += i_stride; } } /* --------------------------------------------------------------------------- */ static void write_y4m_header(FILE *fp, int w, int h, int fps_num, int fps_den, int bit_depth) { static int b_y4m_header_write = 0; if (fp != NULL && !b_y4m_header_write) { char buf[64]; if (bit_depth != 8) { sprintf(buf, "YUV4MPEG2 W%d H%d F%d:%d Ip C%sp%d\n", w, h, fps_num, fps_den, "420", bit_depth); fwrite(buf, 1, strlen(buf), fp); } else { sprintf(buf, "YUV4MPEG2 W%d H%d F%d:%d Ip C%s\n", w, h, fps_num, fps_den, "420"); fwrite(buf, 1, strlen(buf), fp); } b_y4m_header_write = 1; } } /* --------------------------------------------------------------------------- */ static void write_frame(davs2_picture_t *pic, FILE *fp, int b_y4m) { const int bytes_per_sample = pic->bytes_per_sample; if (b_y4m) { const 
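/* in Y4M output, every frame's planes are preceded by a "FRAME" marker line */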
char *s_frm = "FRAME\n"; fwrite(s_frm, 1, strlen(s_frm), fp); } /* write y */ write_frame_plane(fp, pic->planes[0], pic->widths[0], pic->lines[0], bytes_per_sample, pic->strides[0]); if (pic->num_planes == 3) { /* write u */ write_frame_plane(fp, pic->planes[1], pic->widths[1], pic->lines[1], bytes_per_sample, pic->strides[1]); /* write v */ write_frame_plane(fp, pic->planes[2], pic->widths[2], pic->lines[2], bytes_per_sample, pic->strides[2]); } } #endif /// DAVS2_UTILS_H davs2-1.6/version.sh000077500000000000000000000071701337322544400144370ustar00rootroot00000000000000#!/bin/sh # ============================================================================ # File: # version.sh # - get version of repository and generate the file version.h # Author: # Falei LUO # ============================================================================ # setting API version api=`grep '#define DAVS2_BUILD' < ./source/davs2.h | sed 's/^.* \([1-9][0-9]*\).*$/\1/'` VER_R=0 VER_SHA='not-in-git-tree' # get version of remote origin/master and local HEAD if [ -d .git ] && command -v git >/dev/null 2>&1 ; then VER_R=`git rev-list --count origin/master` VER_SHA=`git rev-parse HEAD | cut -c -16` fi # generate version numbers VER_MAJOR=`echo $(($api / 10))` VER_MINOR=`echo $(($api % 10))` # date and time information BUILD_TIME=`date "+%Y-%m-%d %H:%M:%S"` # generate the file version.h echo "// ===========================================================================" > version.h echo "// version.h" >> version.h echo "// - collection of version numbers" >> version.h echo "//" >> version.h echo "// Author: Falei LUO " >> version.h echo "//" >> version.h echo "// ===========================================================================" >> version.h echo "" >> version.h echo "#ifndef DAVS2_VERSION_H" >> version.h echo "#define DAVS2_VERSION_H" >> version.h echo "" >> version.h echo "// version number" >> version.h echo "#define VER_MAJOR $VER_MAJOR // major version number" >> version.h echo "#define VER_MINOR $VER_MINOR // minor version number" >> version.h echo "#define VER_BUILD $VER_R // build number" >> version.h echo "#define VER_SHA_STR \"$VER_SHA\" // commit id" >> version.h echo "" >> version.h echo "// stringify" >> version.h echo "#define _TOSTR(x) #x // stringify x" >> version.h echo "#define TOSTR(x) _TOSTR(x) // stringify x, perform macro expansion" >> version.h echo "" >> version.h echo "// define XVERSION string" >> version.h echo "#define XVERSION VER_MAJOR, VER_MINOR, VER_BUILD" >> version.h echo "#define XVERSION_STR TOSTR(VER_MAJOR) \".\" TOSTR(VER_MINOR) \".\" TOSTR(VER_BUILD) \" \" VER_SHA_STR" >> version.h echo "#define XBUILD_TIME \"$BUILD_TIME\"" >> version.h echo "" >> version.h echo "#endif // DAVS2_VERSION_H" >> version.h mv version.h source/version.h # show version informations echo "#define DAVS2_BUILD $api" echo "#define DAVS2_POINTVER \"$VER_MAJOR.$VER_MINOR.$VER_R\""
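/*
 * ---------------------------------------------------------------------------
 * Supplementary note on start-code scanning (a sketch, not part of the davs2
 * sources): find_start_code() in source/test/inputstream.h (also used by
 * test.c) matches the 00 00 01 start-code prefix with
 *     (*(int *)data & 0x00FFFFFF) != 0x00010000
 * which assumes a little-endian CPU that tolerates unaligned 32-bit loads.
 * The byte-wise version below is an endian- and alignment-neutral equivalent;
 * the name find_start_code_portable is invented for this illustration.
 * ---------------------------------------------------------------------------
 */
#include <stddef.h>
#include <stdint.h>

static const uint8_t *
find_start_code_portable(const uint8_t *data, int len)
{
    /* advance until the buffer begins with the 3-byte prefix 00 00 01 */
    while (len >= 4 && !(data[0] == 0x00 && data[1] == 0x00 && data[2] == 0x01)) {
        ++data;
        --len;
    }
    /* at least the start-code type byte must follow the prefix */
    return len >= 4 ? data : NULL;
}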